Libraries¶

In [1]:
# Upgrade -if needed

# !pip3 install openpyxl
# !pip3 install --upgrade joblib
# !pip3 install --upgrade matplotlib
# !pip3 install --upgrade numpy
# !pip3 install --upgrade pandas
# !pip3 install --upgrade scikit-learn
# !pip3 install --upgrade scipy
# !pip3 install --upgrade seaborn
# !pip3 install --upgrade xgboost
# !pip3 install --upgrade Keras
# !pip3 install --upgrade tensorflow

# not sure gia ta parakatw
# !pip3 install --upgrade hyperopt
# !pip3 install --upgrade scikit-optimize
# !pip3 install --upgrade optuna
# !pip3 install --upgrade xlrd

# Data management
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import scipy.stats as stats

# To display all columns in the dataset.
pd.set_option('display.max_columns', None)

# Visualisation
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# For the models that will be implemented
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# Metrics that will be used to evaluate/train the models
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Optimization of the model - Bayesian optimization
from skopt import BayesSearchCV

from sklearn.tree import DecisionTreeClassifier

# For the feature selection
#there are required for SequentialFeatureSelector to be used
from mlxtend.preprocessing import TransactionEncoder              
from mlxtend.frequent_patterns import association_rules,apriori   
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Interpretation
from sklearn.inspection import permutation_importance



# To avoid irrelevant (to the thesis) messages 
import warnings
warnings.simplefilter("ignore", UserWarning)

import warnings
warnings.filterwarnings('ignore')

Functions that will be used¶

  • To find optimal segmentations by using the Information Values criterion
In [2]:
def analyze_column_and_export(for_segmentation, column_name, output_file):
    # Ensure the necessary columns are available
    if 'Defaulted' not in for_segmentation.columns:
        raise ValueError("The dataset must contain a 'Defaulted' column.")

    # Calculate "Percent of Total Frequency" for each unique value
    total_count = for_segmentation[column_name].value_counts(normalize=True) * 100
    
    # Calculate the "Frequency Count" of Goods (Defaulted = 0)
    goods_count = for_segmentation[for_segmentation['Defaulted'] == 0][column_name].value_counts()
    
    # Calculate "Percent of Column Frequency" for the Goods
    goods_percent = for_segmentation[for_segmentation['Defaulted'] == 0][column_name].value_counts(normalize=True) * 100
    
    # Calculate the "Frequency Count" of the Bads (Defaulted = 1)
    bads_count = for_segmentation[for_segmentation['Defaulted'] == 1][column_name].value_counts()
    
    # Calculate "Percent of Column Frequency" for the Bads
    bads_percent = for_segmentation[for_segmentation['Defaulted'] == 1][column_name].value_counts(normalize=True) * 100
    
    # Create a DataFrame with the calculated values
    summary_df = pd.DataFrame({
        column_name: total_count.index,
        'Percent of Total Frequency': total_count.values,
        'Frequency Count (Goods)': goods_count,
        'Percent of Column Frequency (Goods)': goods_percent,
        'Frequency Count (Bads)': bads_count,
        'Percent of Column Frequency (Bads)': bads_percent,
    }).fillna(0)
    
    # Calculate the good-bad odds
    summary_df['Good-Bad Odds'] = summary_df['Frequency Count (Goods)'] / summary_df['Frequency Count (Bads)']
    summary_df['Good-Bad Odds'].replace([np.inf, -np.inf], 0, inplace=True)
    
    # Calculate the bad rate and convert it to a percentage
    summary_df['Bad Rate'] = (summary_df['Frequency Count (Bads)'] / (summary_df['Frequency Count (Goods)'] + summary_df['Frequency Count (Bads)'])) * 100
    
    # Calculate the weight of evidence (WOE)
    summary_df['WOE'] = np.log(np.where(summary_df['Percent of Column Frequency (Bads)'] == 0, np.nan, summary_df['Percent of Column Frequency (Goods)'] / summary_df['Percent of Column Frequency (Bads)']))
    
    # Replace -inf, inf and NaN in WOE with 0
    summary_df['WOE'].replace([np.inf, -np.inf], 0, inplace=True)
    summary_df['WOE'].fillna(0, inplace=True)
    
    # Calculate Information Value (IV)
    summary_df['IV'] = (summary_df['Percent of Column Frequency (Goods)'] - summary_df['Percent of Column Frequency (Bads)']) * summary_df['WOE'] 
    
    # Fillna with 0 in IV
    summary_df['IV'].fillna(0, inplace=True)
    
    # Add the 'grpchar' column as a copy of the input column
    summary_df['grpchar'] = summary_df[column_name]
    
    # Reset index to avoid ambiguity
    summary_df.reset_index(drop=True, inplace=True)
    
    # Sort the DataFrame based on the specified column in ascending order
    summary_df.sort_values(by=column_name, inplace=True)
    
    # Save the summary DataFrame to an Excel file
    summary_df.to_excel(output_file, index=False)
  • Calculate the IVs after creating the optimally grouped variables
In [3]:
def calculate_iv(data, target_variable):
    """
    Calculate the Information Value (IV) for all variables in the dataset that end with '_segm'.
    
    Parameters:
    data (pd.DataFrame): The dataset containing the segmented variables.
    target_variable (pd.Series): The target variable where 1 indicates default and 0 indicates non-default.
    
    Returns:
    pd.DataFrame: A DataFrame containing the IV values for each segmented variable.
    """
    iv_list = []
    segmented_columns = [col for col in data.columns if col.endswith('_segm')]
    
    for column in segmented_columns:
        # Create a cross-tabulation of the segmented variable with the target variable
        cross_tab = pd.crosstab(data[column], target_variable)
        
        # Calculate the distribution of goods and bads
        cross_tab.columns = ['non_default', 'default']
        cross_tab['total'] = cross_tab['non_default'] + cross_tab['default']
        cross_tab['non_default_dist'] = cross_tab['non_default'] / cross_tab['non_default'].sum()
        cross_tab['default_dist'] = cross_tab['default'] / cross_tab['default'].sum()
        
        # Calculate WoE and IV
        cross_tab['woe'] = np.log(cross_tab['non_default_dist'] / cross_tab['default_dist'])
        cross_tab['iv'] = (cross_tab['non_default_dist'] - cross_tab['default_dist']) * cross_tab['woe']*100
        
        # Sum the IV values to get the IV for the variable
        iv_value = cross_tab['iv'].sum()
        
        iv_list.append({'Variable': column, 'IV': iv_value})
    
    # Create a DataFrame from the IV list
    iv_df = pd.DataFrame(iv_list).sort_values(by='IV', ascending=False)
    
    return iv_df
  • For Logistic Regressio, Random Forest & GradientBoosting:

Function that calculates KS, AUC & Gini -on an already fitted model

In [4]:
def calculate_DiscriminatoryStats(X, y, model, dataset_name):
    # Getting the predicted probabilities
    pred_prob_y = model.predict_proba(X)[:, 1]

    # probability of being good
    pred_prob_y_good = 1 - pred_prob_y

    # create the dataframe with rounded probabilities and calculate CreditScore
    df = pd.DataFrame({'Prob_Good': np.round(pred_prob_y_good, 2), 'y': y})
    df['Credit Score'] = (400 + np.round(28.85 * np.log((pred_prob_y_good) / (1 - pred_prob_y_good)), 0)).astype(int)
    
    # Find the minimum positive credit score
    min_positive_credit_score = df[df['Credit Score'] > 0]['Credit Score'].min()
    
    # Apply the cap to the credit scores
    df['Credit Score'] = df['Credit Score'].apply(lambda x: min_positive_credit_score if x <= 0 else x)

    # Calculate the number of applicants per credit score
    df_summary = df.groupby('Credit Score').size().reset_index(name='num_applicants')

    # Calculate the counts of goods and bads from ground truth
    good_counts = df[df['y'] == 0].groupby('Credit Score').size().reset_index(name='num_goods')
    bad_counts = df[df['y'] == 1].groupby('Credit Score').size().reset_index(name='num_bads')
    df_summary = df_summary.merge(good_counts, on='Credit Score', how='left').merge(bad_counts, on='Credit Score', how='left').fillna(0)

    # Add the total column
    df_summary['total'] = df_summary['num_goods'] + df_summary['num_bads']

    # Calculate the cumulative frequencies of goods and bads
    df_summary['cum_freq_goods'] = df_summary['num_goods'].cumsum()
    df_summary['cum_freq_bads'] = df_summary['num_bads'].cumsum()

    # Calculate the % of total goods and bads
    total_goods = df_summary['num_goods'].sum()
    total_bads = df_summary['num_bads'].sum()
    df_summary['perc_total_goods'] = (df_summary['num_goods'] / total_goods * 100).round(2)
    df_summary['perc_total_bads'] = (df_summary['num_bads'] / total_bads * 100).round(2)

    # Calculate the cumulative percent of goods and bads
    df_summary['cum_perc_goods'] = (df_summary['cum_freq_goods'] / total_goods * 100).round(2)
    df_summary['cum_perc_bads'] = (df_summary['cum_freq_bads'] / total_bads * 100).round(2)

    # Calculate the difference between the cumulative distributions
    df_summary['Separation'] = (df_summary['cum_perc_goods'] - df_summary['cum_perc_bads']).round(2)

    # Calculate the Kolmogorov-Smirnov statistic
    ks_statistic = df_summary['Separation'].abs().max()

    # Calculate ROC curve and AUC
    fpr, tpr, _ = roc_curve(y, pred_prob_y)
    auc_metric = auc(fpr, tpr)

    # Calculate Gini coefficient
    gini_metric = 2 * auc_metric - 1

    # Display the final dataframe
    print(df_summary)

    # Plotting the cumulative percentages of goods and bads
    fig, ax = plt.subplots(1, 2, figsize=(18, 6))

    ax[0].plot(df_summary['Credit Score'], df_summary['cum_perc_goods'], label='Cumulative % of Goods', marker='o')
    ax[0].plot(df_summary['Credit Score'], df_summary['cum_perc_bads'], label='Cumulative % of Bads', marker='o')
    ax[0].set_xlabel('Credit Score')
    ax[0].set_ylabel('Cumulative Percentage')
    ax[0].set_title('Cumulative Percentage of Goods and Bads')
    ax[0].legend()
    ax[0].grid(True)

    # Plotting the ROC curve
    ax[1].plot(fpr, tpr, label=f'ROC curve (area = {auc_metric:.2f})')
    ax[1].plot([0, 1], [0, 1], 'k--', label='45 degree line')
    ax[1].set_xlabel('False Positive Rate')
    ax[1].set_ylabel('True Positive Rate')
    ax[1].set_title('Receiver Operating Characteristic (ROC) Curve')
    ax[1].legend()
    ax[1].grid(True)

    plt.show()

    # Print the KS statistic, AUC metric, and Gini metric
    print(f'The Kolmogorov-Smirnov statistic on the {dataset_name} data is: {ks_statistic:.2f}')
    print(f'AUC metric on the {dataset_name} data is: {auc_metric:.2f}')
    print(f'Gini metric on the {dataset_name} data is: {gini_metric:.2f}')
  • For Logistic Regressio, Random Forest & GradientBoosting:

Function that will be used for the Population Stability Index caclulation

(comparison of the Scoring distribution between the training and test data)

In [5]:
def calculate_credit_scores(X, model):
    pred_prob_y = model.predict_proba(X)[:, 1]
    pred_prob_y_good = 1 - pred_prob_y
    credit_scores = (400 + np.round(28.85 * np.log((pred_prob_y_good) / (1 - pred_prob_y_good)), 0)).astype(int)
    return credit_scores

def create_scoring_pools(train_scores):
    percentiles = np.percentile(train_scores, [10, 20, 30, 40, 50, 60, 70, 80, 90])
    bins = [-np.inf] + list(percentiles) + [np.inf]
    
    labels = []
    for i in range(len(bins)-1):
        if bins[i] == -np.inf:
            labels.append(f'<={int(bins[i+1])}')
        elif bins[i+1] == np.inf:
            labels.append(f'>{int(bins[i])}')
        else:
            labels.append(f'{int(bins[i]+1)} - {int(bins[i+1])}')
    
    scoring_pools = pd.cut(train_scores, bins=bins, labels=labels)
    return scoring_pools, bins, labels

def calculate_psi(train_pools, test_scores, bins, labels):
    train_dist = train_pools.value_counts().sort_index() / len(train_pools)
    test_pools = pd.cut(test_scores, bins=bins, labels=labels)
    test_dist = test_pools.value_counts().sort_index() / len(test_pools)
    psi_values = (train_dist - test_dist) * np.log(train_dist / test_dist)
    psi_stat = psi_values.sum()
    return train_dist, test_dist, psi_stat

def plot_distribution(train_dist, test_dist, labels):
    df = pd.DataFrame({'Train': train_dist, 'Test': test_dist})
    df.index = labels
    df.plot(kind='bar', figsize=(12, 6))
    plt.xlabel('Scoring Pools')
    plt.ylabel('Proportion')
    plt.title('Distribution of Scoring Pools in Train and Test Sets')
    plt.grid(True)
    plt.show()

def calculate_and_plot_psi(train_X, test_X, model, train_name, test_name):
    train_scores = calculate_credit_scores(train_X, model)
    test_scores = calculate_credit_scores(test_X, model)
    train_pools, bins, labels = create_scoring_pools(train_scores)
    train_dist, test_dist, psi_stat = calculate_psi(train_pools, test_scores, bins, labels)
    plot_distribution(train_dist, test_dist, labels)
    print(f'The PSI statistic between {train_name} and {test_name} sets is: {psi_stat:.3f}')
    if psi_stat < 0.1:
        print(f'No significant shift in the population (PSI = {psi_stat:.3f})')
    elif psi_stat < 0.25:
        print(f'Moderate shift in the population (PSI = {psi_stat:.3f})')
    else:
        print(f'Significant shift in the population (PSI = {psi_stat:.3f})')
  • For IV comparison of the final characteristics of the model
In [6]:
def calculate_iv_comparison(train_data, test_data, y_train, y_test):
    """
    Calculate and compare the Information Value (IV) for all variables in the training and test datasets that end with '_segm'.
    
    Parameters:
    train_data (pd.DataFrame): The training dataset containing the segmented variables.
    test_data (pd.DataFrame): The test dataset containing the segmented variables.
    y_train (pd.Series): The target variable for the training dataset where 1 indicates default and 0 indicates non-default.
    y_test (pd.Series): The target variable for the test dataset where 1 indicates default and 0 indicates non-default.
    
    Returns:
    pd.DataFrame: A DataFrame containing the IV values for each segmented variable in both datasets.
    """
    iv_list = []
    segmented_columns = [col for col in train_data.columns]
    
    for column in segmented_columns:
        # Training data
        cross_tab_train = pd.crosstab(train_data[column], y_train)
        cross_tab_train.columns = ['non_default', 'default']
        cross_tab_train['total'] = cross_tab_train['non_default'] + cross_tab_train['default']
        cross_tab_train['non_default_dist'] = cross_tab_train['non_default'] / cross_tab_train['non_default'].sum()
        cross_tab_train['default_dist'] = cross_tab_train['default'] / cross_tab_train['default'].sum()
        cross_tab_train['woe'] = np.log(cross_tab_train['non_default_dist'] / cross_tab_train['default_dist'])
        cross_tab_train['iv'] = (cross_tab_train['non_default_dist'] - cross_tab_train['default_dist']) * cross_tab_train['woe']*100
        iv_train = np.round(cross_tab_train['iv'].sum(),2)
        
        # Test data
        cross_tab_test = pd.crosstab(test_data[column], y_test)
        cross_tab_test.columns = ['non_default', 'default']
        cross_tab_test['total'] = cross_tab_test['non_default'] + cross_tab_test['default']
        cross_tab_test['non_default_dist'] = cross_tab_test['non_default'] / cross_tab_test['non_default'].sum()
        cross_tab_test['default_dist'] = cross_tab_test['default'] / cross_tab_test['default'].sum()
        cross_tab_test['woe'] = np.log(cross_tab_test['non_default_dist'] / cross_tab_test['default_dist'])
        cross_tab_test['iv'] = (cross_tab_test['non_default_dist'] - cross_tab_test['default_dist']) * cross_tab_test['woe']*100
        iv_test = np.round(cross_tab_test['iv'].sum(), 2)
        
        iv_list.append({
            'Variable': column, 
            'IV_Train': iv_train, 
            'IV_Test': iv_test
        })
    
    # Create a DataFrame from the IV list
    iv_df = pd.DataFrame(iv_list).sort_values(by='IV_Train', ascending=False)
    
    return iv_df
  • Funcions that will be used only for the NN model:

KS/AUC/Gini

PSI

In [7]:
def calculate_DiscriminatoryStats_nn(X, y, model, dataset_name):
    # Getting the predicted probabilities
    pred_prob_y = model.predict(X).flatten()

    # probability of being good
    pred_prob_y_good = 1 - pred_prob_y

    # create the dataframe with rounded probabilities and calculate CreditScore
    df = pd.DataFrame({'Prob_Good': np.round(pred_prob_y_good, 2), 'y': y})
    df['Credit Score'] = (400 + np.round(28.85 * np.log((pred_prob_y_good) / (1 - pred_prob_y_good)), 0)).astype(int)
    
    # Find the minimum positive credit score
    min_positive_credit_score = df[df['Credit Score'] > 0]['Credit Score'].min()
    
    # Apply the cap to the credit scores
    df['Credit Score'] = df['Credit Score'].apply(lambda x: min_positive_credit_score if x <= 0 else x)

    # Calculate the number of applicants per credit score
    df_summary = df.groupby('Credit Score').size().reset_index(name='num_applicants')

    # Calculate the counts of goods and bads from ground truth
    good_counts = df[df['y'] == 0].groupby('Credit Score').size().reset_index(name='num_goods')
    bad_counts = df[df['y'] == 1].groupby('Credit Score').size().reset_index(name='num_bads')
    df_summary = df_summary.merge(good_counts, on='Credit Score', how='left').merge(bad_counts, on='Credit Score', how='left').fillna(0)

    # Add the total column
    df_summary['total'] = df_summary['num_goods'] + df_summary['num_bads']

    # Calculate the cumulative frequencies of goods and bads
    df_summary['cum_freq_goods'] = df_summary['num_goods'].cumsum()
    df_summary['cum_freq_bads'] = df_summary['num_bads'].cumsum()

    # Calculate the % of total goods and bads
    total_goods = df_summary['num_goods'].sum()
    total_bads = df_summary['num_bads'].sum()
    df_summary['perc_total_goods'] = (df_summary['num_goods'] / total_goods * 100).round(2)
    df_summary['perc_total_bads'] = (df_summary['num_bads'] / total_bads * 100).round(2)

    # Calculate the cumulative percent of goods and bads
    df_summary['cum_perc_goods'] = (df_summary['cum_freq_goods'] / total_goods * 100).round(2)
    df_summary['cum_perc_bads'] = (df_summary['cum_freq_bads'] / total_bads * 100).round(2)

    # Calculate the difference between the cumulative distributions
    df_summary['Separation'] = (df_summary['cum_perc_goods'] - df_summary['cum_perc_bads']).round(2)

    # Calculate the Kolmogorov-Smirnov statistic
    ks_statistic = df_summary['Separation'].abs().max()

    # Calculate ROC curve and AUC
    fpr, tpr, _ = roc_curve(y, pred_prob_y)
    auc_metric = auc(fpr, tpr)

    # Calculate Gini coefficient
    gini_metric = 2 * auc_metric - 1

    # Display the final dataframe
    print(df_summary)

    # Plotting the cumulative percentages of goods and bads
    fig, ax = plt.subplots(1, 2, figsize=(18, 6))

    ax[0].plot(df_summary['Credit Score'], df_summary['cum_perc_goods'], label='Cumulative % of Goods', marker='o')
    ax[0].plot(df_summary['Credit Score'], df_summary['cum_perc_bads'], label='Cumulative % of Bads', marker='o')
    ax[0].set_xlabel('Credit Score')
    ax[0].set_ylabel('Cumulative Percentage')
    ax[0].set_title('Cumulative Percentage of Goods and Bads')
    ax[0].legend()
    ax[0].grid(True)

    # Plotting the ROC curve
    ax[1].plot(fpr, tpr, label=f'ROC curve (area = {auc_metric:.2f})')
    ax[1].plot([0, 1], [0, 1], 'k--', label='45 degree line')
    ax[1].set_xlabel('False Positive Rate')
    ax[1].set_ylabel('True Positive Rate')
    ax[1].set_title('Receiver Operating Characteristic (ROC) Curve')
    ax[1].legend()
    ax[1].grid(True)

    plt.show()

    # Print the KS statistic, AUC metric, and Gini metric
    print(f'The Kolmogorov-Smirnov statistic on the {dataset_name} data is: {ks_statistic:.2f}')
    print(f'AUC metric on the {dataset_name} data is: {auc_metric:.2f}')
    print(f'Gini metric on the {dataset_name} data is: {gini_metric:.2f}')
In [8]:
def calculate_credit_scores_nn(X, model):
    # Get predicted probabilities using model.predict
    pred_prob_y = model.predict(X).flatten()
    pred_prob_y_good = 1 - pred_prob_y
    credit_scores = (400 + np.round(28.85 * np.log((pred_prob_y_good) / (1 - pred_prob_y_good)), 0)).astype(int)
    return credit_scores

def create_scoring_pools_nn(train_scores):
    percentiles = np.percentile(train_scores, [10, 20, 30, 40, 50, 60, 70, 80, 90])
    bins = [-np.inf] + list(percentiles) + [np.inf]
    
    labels = []
    for i in range(len(bins)-1):
        if bins[i] == -np.inf:
            labels.append(f'<={int(bins[i+1])}')
        elif bins[i+1] == np.inf:
            labels.append(f'>{int(bins[i])}')
        else:
            labels.append(f'{int(bins[i]+1)} - {int(bins[i+1])}')
    
    scoring_pools = pd.cut(train_scores, bins=bins, labels=labels)
    return scoring_pools, bins, labels

def calculate_psi_nn(train_pools, test_scores, bins, labels):
    train_dist = train_pools.value_counts().sort_index() / len(train_pools)
    test_pools = pd.cut(test_scores, bins=bins, labels=labels)
    test_dist = test_pools.value_counts().sort_index() / len(test_pools)
    psi_values = (train_dist - test_dist) * np.log(train_dist / test_dist)
    psi_stat = psi_values.sum()
    return train_dist, test_dist, psi_stat

def plot_distribution_nn(train_dist, test_dist, labels):
    df = pd.DataFrame({'Train': train_dist, 'Test': test_dist})
    df.index = labels
    df.plot(kind='bar', figsize=(12, 6))
    plt.xlabel('Scoring Pools')
    plt.ylabel('Proportion')
    plt.title('Distribution of Scoring Pools in Train and Test Sets')
    plt.grid(True)
    plt.show()

def calculate_and_plot_psi_nn(train_X, test_X, model, train_name, test_name):
    train_scores = calculate_credit_scores_nn(train_X, model)
    test_scores = calculate_credit_scores_nn(test_X, model)
    train_pools, bins, labels = create_scoring_pools_nn(train_scores)
    train_dist, test_dist, psi_stat = calculate_psi_nn(train_pools, test_scores, bins, labels)
    plot_distribution_nn(train_dist, test_dist, labels)
    print(f'The PSI statistic between {train_name} and {test_name} sets is: {psi_stat:.3f}')
    if psi_stat < 0.1:
        print(f'No significant shift in the population (PSI = {psi_stat:.3f})')
    elif psi_stat < 0.25:
        print(f'Moderate shift in the population (PSI = {psi_stat:.3f})')
    else:
        print(f'Significant shift in the population (PSI = {psi_stat:.3f})')

"Loan_default prediction" - dataset by LendingClub.com¶

  • data: https://www.kaggle.com/datasets/wordsforthewise/lending-club
  • dictionary: https://github.com/dosei1/Lending-Club-Loan-Data/blob/master/LCDataDictionary.csv
In [9]:
# Import the data
dataset = pd.read_csv("D:/Thesis data/accepted_2007_to_2018Q4.csv")


# Display the first few rows
print(dataset.shape)

dataset.head()
(2260701, 151)
Out[9]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status pymnt_plan url desc purpose title zip_code addr_state dti delinq_2yrs earliest_cr_line fico_range_low fico_range_high inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d last_fico_range_high last_fico_range_low collections_12_mths_ex_med mths_since_last_major_derog policy_code application_type annual_inc_joint dti_joint verification_status_joint acc_now_delinq tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy bc_util chargeoff_within_12_mths delinq_amnt mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mort_acc mths_since_recent_bc mths_since_recent_bc_dlq mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd num_actv_bc_tl num_actv_rev_tl num_bc_sats num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0 num_sats num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit revol_bal_joint sec_app_fico_range_low sec_app_fico_range_high sec_app_earliest_cr_line sec_app_inq_last_6mths sec_app_mort_acc sec_app_open_acc sec_app_revol_util sec_app_open_act_il sec_app_num_rev_accts sec_app_chargeoff_within_12_mths sec_app_collections_12_mths_ex_med sec_app_mths_since_last_major_derog hardship_flag hardship_type hardship_reason hardship_status deferral_term hardship_amount hardship_start_date hardship_end_date payment_plan_start_date hardship_length hardship_dpd hardship_loan_status orig_projected_additional_accrued_interest hardship_payoff_balance_amount hardship_last_payment_amount disbursement_method debt_settlement_flag debt_settlement_flag_date settlement_status settlement_date settlement_amount settlement_percentage settlement_term
0 68407277 NaN 3600.0 3600.0 3600.0 36 months 13.99 123.03 C C4 leadman 10+ years MORTGAGE 55000.0 Not Verified Dec-2015 Fully Paid n https://lendingclub.com/browse/loanDetail.acti... NaN debt_consolidation Debt consolidation 190xx PA 5.91 0.0 Aug-2003 675.0 679.0 1.0 30.0 NaN 7.0 0.0 2765.0 29.7 13.0 w 0.00 0.00 4421.723917 4421.72 3600.00 821.72 0.0 0.0 0.0 Jan-2019 122.67 NaN Mar-2019 564.0 560.0 0.0 30.0 1.0 Individual NaN NaN NaN 0.0 722.0 144904.0 2.0 2.0 0.0 1.0 21.0 4981.0 36.0 3.0 3.0 722.0 34.0 9300.0 3.0 1.0 4.0 4.0 20701.0 1506.0 37.2 0.0 0.0 148.0 128.0 3.0 3.0 1.0 4.0 69.0 4.0 69.0 2.0 2.0 4.0 2.0 5.0 3.0 4.0 9.0 4.0 7.0 0.0 0.0 0.0 3.0 76.9 0.0 0.0 0.0 178050.0 7746.0 2400.0 13734.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
1 68355089 NaN 24700.0 24700.0 24700.0 36 months 11.99 820.28 C C1 Engineer 10+ years MORTGAGE 65000.0 Not Verified Dec-2015 Fully Paid n https://lendingclub.com/browse/loanDetail.acti... NaN small_business Business 577xx SD 16.06 1.0 Dec-1999 715.0 719.0 4.0 6.0 NaN 22.0 0.0 21470.0 19.2 38.0 w 0.00 0.00 25679.660000 25679.66 24700.00 979.66 0.0 0.0 0.0 Jun-2016 926.35 NaN Mar-2019 699.0 695.0 0.0 NaN 1.0 Individual NaN NaN NaN 0.0 0.0 204396.0 1.0 1.0 0.0 1.0 19.0 18005.0 73.0 2.0 3.0 6472.0 29.0 111800.0 0.0 0.0 6.0 4.0 9733.0 57830.0 27.1 0.0 0.0 113.0 192.0 2.0 2.0 4.0 2.0 NaN 0.0 6.0 0.0 5.0 5.0 13.0 17.0 6.0 20.0 27.0 5.0 22.0 0.0 0.0 0.0 2.0 97.4 7.7 0.0 0.0 314017.0 39475.0 79300.0 24667.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
2 68341763 NaN 20000.0 20000.0 20000.0 60 months 10.78 432.66 B B4 truck driver 10+ years MORTGAGE 63000.0 Not Verified Dec-2015 Fully Paid n https://lendingclub.com/browse/loanDetail.acti... NaN home_improvement NaN 605xx IL 10.78 0.0 Aug-2000 695.0 699.0 0.0 NaN NaN 6.0 0.0 7869.0 56.2 18.0 w 0.00 0.00 22705.924294 22705.92 20000.00 2705.92 0.0 0.0 0.0 Jun-2017 15813.30 NaN Mar-2019 704.0 700.0 0.0 NaN 1.0 Joint App 71000.0 13.85 Not Verified 0.0 0.0 189699.0 0.0 1.0 0.0 4.0 19.0 10827.0 73.0 0.0 2.0 2081.0 65.0 14000.0 2.0 5.0 1.0 6.0 31617.0 2737.0 55.9 0.0 0.0 125.0 184.0 14.0 14.0 5.0 101.0 NaN 10.0 NaN 0.0 2.0 3.0 2.0 4.0 6.0 4.0 7.0 3.0 6.0 0.0 0.0 0.0 0.0 100.0 50.0 0.0 0.0 218418.0 18696.0 6200.0 14877.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
3 66310712 NaN 35000.0 35000.0 35000.0 60 months 14.85 829.90 C C5 Information Systems Officer 10+ years MORTGAGE 110000.0 Source Verified Dec-2015 Current n https://lendingclub.com/browse/loanDetail.acti... NaN debt_consolidation Debt consolidation 076xx NJ 17.06 0.0 Sep-2008 785.0 789.0 0.0 NaN NaN 13.0 0.0 7802.0 11.6 17.0 w 15897.65 15897.65 31464.010000 31464.01 19102.35 12361.66 0.0 0.0 0.0 Feb-2019 829.90 Apr-2019 Mar-2019 679.0 675.0 0.0 NaN 1.0 Individual NaN NaN NaN 0.0 0.0 301500.0 1.0 1.0 0.0 1.0 23.0 12609.0 70.0 1.0 1.0 6987.0 45.0 67300.0 0.0 1.0 0.0 2.0 23192.0 54962.0 12.1 0.0 0.0 36.0 87.0 2.0 2.0 1.0 2.0 NaN NaN NaN 0.0 4.0 5.0 8.0 10.0 2.0 10.0 13.0 5.0 13.0 0.0 0.0 0.0 1.0 100.0 0.0 0.0 0.0 381215.0 52226.0 62500.0 18000.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
4 68476807 NaN 10400.0 10400.0 10400.0 60 months 22.45 289.91 F F1 Contract Specialist 3 years MORTGAGE 104433.0 Source Verified Dec-2015 Fully Paid n https://lendingclub.com/browse/loanDetail.acti... NaN major_purchase Major purchase 174xx PA 25.37 1.0 Jun-1998 695.0 699.0 3.0 12.0 NaN 12.0 0.0 21929.0 64.5 35.0 w 0.00 0.00 11740.500000 11740.50 10400.00 1340.50 0.0 0.0 0.0 Jul-2016 10128.96 NaN Mar-2018 704.0 700.0 0.0 NaN 1.0 Individual NaN NaN NaN 0.0 0.0 331730.0 1.0 3.0 0.0 3.0 14.0 73839.0 84.0 4.0 7.0 9702.0 78.0 34000.0 2.0 1.0 3.0 10.0 27644.0 4567.0 77.5 0.0 0.0 128.0 210.0 4.0 4.0 6.0 4.0 12.0 1.0 12.0 0.0 4.0 6.0 5.0 9.0 10.0 7.0 19.0 6.0 12.0 0.0 0.0 0.0 4.0 96.6 60.0 0.0 0.0 439570.0 95768.0 20300.0 88097.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
  • The dataset is quite large -and surely big enough for the purposes of our research analyses (comparison of Statistical Machine Learning methods for PD estimation of Loan Applications). We will keep only the approved applications of the 1st half of 2018 (i.e. up to June), since we want to make predictions based on the most up-to-date client profiles. No reject inference (i.e. fuzzy augmentation) will be used.
In [10]:
dataset = pd.read_csv("D:/Thesis data/accepted_2007_to_2018Q4.csv", parse_dates=['issue_d'], infer_datetime_format=True)

dataset = dataset[(dataset.issue_d >= '2018-01-01 00:00:00') & (dataset.issue_d < '2018-07-01 00:00:00')]
dataset = dataset.reset_index(drop=True)
dataset.head()
Out[10]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status pymnt_plan url desc purpose title zip_code addr_state dti delinq_2yrs earliest_cr_line fico_range_low fico_range_high inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d last_fico_range_high last_fico_range_low collections_12_mths_ex_med mths_since_last_major_derog policy_code application_type annual_inc_joint dti_joint verification_status_joint acc_now_delinq tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy bc_util chargeoff_within_12_mths delinq_amnt mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mort_acc mths_since_recent_bc mths_since_recent_bc_dlq mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd num_actv_bc_tl num_actv_rev_tl num_bc_sats num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0 num_sats num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit revol_bal_joint sec_app_fico_range_low sec_app_fico_range_high sec_app_earliest_cr_line sec_app_inq_last_6mths sec_app_mort_acc sec_app_open_acc sec_app_revol_util sec_app_open_act_il sec_app_num_rev_accts sec_app_chargeoff_within_12_mths sec_app_collections_12_mths_ex_med sec_app_mths_since_last_major_derog hardship_flag hardship_type hardship_reason hardship_status deferral_term hardship_amount hardship_start_date hardship_end_date payment_plan_start_date hardship_length hardship_dpd hardship_loan_status orig_projected_additional_accrued_interest hardship_payoff_balance_amount hardship_last_payment_amount disbursement_method debt_settlement_flag debt_settlement_flag_date settlement_status settlement_date settlement_amount settlement_percentage settlement_term
0 130954621 NaN 5000.0 5000.0 5000.0 36 months 20.39 186.82 D D4 General Manager 8 years RENT 50000.0 Verified 2018-03-01 Current n https://lendingclub.com/browse/loanDetail.acti... NaN other Other 740xx OK 21.80 1.0 Jan-2009 665.0 669.0 0.0 9.0 NaN 5.0 0.0 116.0 23.2 18.0 w 3780.31 3780.31 2043.690000 2043.69 1219.69 824.00 0.0 0.0 0.0 Mar-2019 186.82 Apr-2019 Mar-2019 609.0 605.0 0.0 9.0 1.0 Individual NaN NaN NaN 0.0 0.0 19344.0 0.0 2.0 0.0 1.0 16.0 14118.0 51.0 1.0 2.0 85.0 58.0 500.0 9.0 0.0 5.0 3.0 3869.0 384.0 23.2 1.0 0.0 80.0 13.0 11.0 11.0 0.0 11.0 NaN 2.0 NaN 4.0 2.0 2.0 2.0 2.0 15.0 2.0 2.0 2.0 5.0 0.0 0.0 1.0 1.0 77.8 0.0 0.0 0.0 33430.0 19344.0 500.0 27820.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
1 130964697 NaN 15000.0 15000.0 15000.0 36 months 9.92 483.45 B B2 IT Director 2 years OWN 196000.0 Source Verified 2018-03-01 Current n https://lendingclub.com/browse/loanDetail.acti... NaN debt_consolidation Debt consolidation 337xx FL 18.29 0.0 Jul-1998 700.0 704.0 0.0 65.0 NaN 19.0 0.0 24243.0 46.3 53.0 w 10878.50 10878.50 5301.420000 5301.42 4121.50 1179.92 0.0 0.0 0.0 Feb-2019 483.45 Apr-2019 Mar-2019 694.0 690.0 0.0 NaN 1.0 Individual NaN NaN NaN 0.0 0.0 534954.0 4.0 3.0 2.0 2.0 6.0 113470.0 59.0 4.0 12.0 10495.0 51.0 52400.0 4.0 1.0 7.0 15.0 31468.0 7368.0 74.1 0.0 0.0 141.0 236.0 4.0 4.0 5.0 11.0 NaN 6.0 NaN 0.0 4.0 10.0 5.0 16.0 11.0 14.0 37.0 10.0 19.0 0.0 0.0 0.0 6.0 98.0 75.0 0.0 0.0 605228.0 137713.0 28500.0 147178.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
2 130955326 NaN 11200.0 11200.0 11200.0 60 months 30.79 367.82 G G1 Client services < 1 year RENT 44000.0 Not Verified 2018-03-01 Current n https://lendingclub.com/browse/loanDetail.acti... NaN medical Medical expenses 030xx NH 43.97 1.0 Jul-2007 665.0 669.0 2.0 6.0 NaN 8.0 0.0 1526.0 24.6 14.0 w 10193.73 10193.73 4007.700000 4007.70 1006.27 3001.43 0.0 0.0 0.0 Feb-2019 367.82 Apr-2019 Mar-2019 629.0 625.0 0.0 70.0 1.0 Joint App 81000.0 31.94 Not Verified 0.0 0.0 67173.0 1.0 4.0 1.0 4.0 8.0 65647.0 89.0 1.0 1.0 1011.0 84.0 6200.0 8.0 1.0 10.0 5.0 8397.0 632.0 66.7 0.0 0.0 124.0 128.0 5.0 5.0 0.0 34.0 35.0 0.0 35.0 1.0 2.0 3.0 2.0 3.0 8.0 4.0 6.0 3.0 8.0 0.0 0.0 0.0 2.0 71.4 0.0 0.0 0.0 80367.0 67173.0 1900.0 74167.0 7101.0 610.0 614.0 Feb-2005 3.0 1.0 14.0 80.0 11.0 8.0 0.0 2.0 37.0 N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
3 130504052 NaN 25000.0 25000.0 25000.0 60 months 21.85 688.35 D D5 Asphalt Supervisor 10+ years MORTGAGE 65000.0 Source Verified 2018-03-01 Current n https://lendingclub.com/browse/loanDetail.acti... NaN debt_consolidation Debt consolidation 361xx AL 12.89 1.0 Mar-1995 665.0 669.0 1.0 22.0 NaN 7.0 0.0 8657.0 98.4 16.0 w 22188.73 22188.73 7511.160000 7511.16 2811.27 4699.89 0.0 0.0 0.0 Feb-2019 688.35 Apr-2019 Mar-2019 669.0 665.0 0.0 23.0 1.0 Individual NaN NaN NaN 0.0 0.0 74795.0 0.0 2.0 0.0 2.0 16.0 8382.0 82.0 0.0 0.0 3237.0 90.0 8800.0 4.0 3.0 3.0 2.0 10685.0 63.0 98.1 0.0 0.0 69.0 126.0 72.0 16.0 2.0 126.0 NaN 0.0 22.0 2.0 1.0 3.0 1.0 1.0 4.0 3.0 9.0 3.0 7.0 0.0 0.0 1.0 0.0 75.0 100.0 0.0 0.0 101234.0 17039.0 3300.0 10220.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
4 130956066 NaN 3000.0 3000.0 3000.0 36 months 7.34 93.10 A A4 Scale Technician 9 years RENT 52000.0 Source Verified 2018-03-01 Fully Paid n https://lendingclub.com/browse/loanDetail.acti... NaN major_purchase Major purchase 988xx WA 0.58 0.0 Jan-1998 760.0 764.0 0.0 26.0 NaN 7.0 0.0 141.0 0.5 30.0 w 0.00 0.00 3011.577285 3011.58 3000.00 11.58 0.0 0.0 0.0 May-2018 614.03 NaN Nov-2018 764.0 760.0 0.0 NaN 1.0 Individual NaN NaN NaN 0.0 0.0 150592.0 0.0 0.0 1.0 2.0 7.0 0.0 NaN 0.0 1.0 141.0 1.0 31000.0 1.0 2.0 2.0 3.0 25099.0 30359.0 0.5 0.0 0.0 132.0 242.0 18.0 7.0 4.0 18.0 NaN 7.0 NaN 0.0 1.0 1.0 4.0 15.0 7.0 6.0 19.0 1.0 7.0 0.0 0.0 0.0 1.0 96.7 0.0 0.0 0.0 191216.0 141.0 30500.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
In [11]:
dataset.shape
Out[11]:
(238636, 151)
In [12]:
dataset.dtypes
Out[12]:
id                        object
member_id                float64
loan_amnt                float64
funded_amnt              float64
funded_amnt_inv          float64
                          ...   
settlement_status         object
settlement_date           object
settlement_amount        float64
settlement_percentage    float64
settlement_term          float64
Length: 151, dtype: object

Default Definition & Exclusion¶

The mapping of Good-Bad applicants (i.e. the creation of the Target variable) is made upon using the descriptions as given by LendingClub.com:

  • Current: Loan is up to date on all outstanding payments.
  • In Grace Period: Loan is past due but within the 15-day grace period.
  • Late (16-30): Loan has not been current for 16 to 30 days.
  • Late (31-120): Loan has not been current for 31 to 120 days.
  • Fully paid: Loan has been fully repaid, either at the expiration of the 3- or 5-year year term or as a result of a prepayment.
  • Default: Loan has not been current for an extended period of time.
  • Charged Off: Loan for which there is no longer a reasonable expectation of further payments.

(https://www.kaggle.com/code/pavlofesenko/minimizing-risks-for-loan-investments)

In [13]:
dataset['Defaulted'] = dataset['loan_status'].isin(['Charged Off', 'Late (16-30 days)', 'Late (31-120 days)']).astype('int')
dataset['Defaulted'].value_counts()
Out[13]:
Defaulted
0    225388
1     13248
Name: count, dtype: int64
  • The bad rate (i.e. Default Rate) of the selected period stands at 5.55%
In [14]:
#Total Bad Rate for the approved applications of 2018
dataset['Defaulted'].value_counts()/len(dataset['Defaulted'])
Out[14]:
Defaulted
0    0.944484
1    0.055516
Name: count, dtype: float64

Hence:

  • 1 is assigned to Bad applicants
  • 0 is assigned to Good applicants

Pruning the dataset - Variables that will be dropped¶

Some variables are not available at the time of the loan request, such as the "Interest rate" of the loan (as it is determined by the financial institution based on the risk-level of the client). Additionally, some other variables such as the "Employment Title" & "URL" are categorical with too many values -and no meaningful insight could be obtained. Finally, some other variables will be dropped, since the information they contain is generally regarded of minimum importance -such as the "verification_status_joint", where the info exists for a very small percentage of the applications (since very few applications have co-applicants on the first place).

Furthermore, it is important to note that in credit risk, all the variables used on a model should have a clear and sound interpretation. Hence, the observed patterns between the Bad Rate and the values of each predictor should be interpretable.

  1. Columns that have both text and numerical values -that will also not provide useful insights

id

desc

next_pymnt_d

verification_status_joint

sec_app_earliest_cr_line

hardship_type

hardship_reason

hardship_status

hardship_start_date

hardship_end_date

payment_plan_start_date

hardship_loan_status

debt_settlement_flag_date

settlement_status

settlement_date

In [15]:
cols_to_be_dropped_a = [
"id",
"desc",
"next_pymnt_d",
"verification_status_joint",
"sec_app_earliest_cr_line",
"hardship_type",
"hardship_reason",
"hardship_status",
"hardship_start_date",
"hardship_end_date",
"payment_plan_start_date",
"hardship_loan_status",
"debt_settlement_flag_date",
"settlement_status",
"settlement_date",
]
  1. Columns that do not have description and are not self-expanatory or are not available at the time of application
  • For example, the variabe "hardship_flag" is probably a flag that corresponds to difficulties on repaying loan amoun, however it is probably updated as soon as the borrower decleares inavailability to repay the loan. Hence, it cannot be used as predictor, since its information probably comes retrospectively, as well as it will be highly correlated with the "Default" target variable and would introduce bias to the analysis. Same goes for the "out_prncp" characteristic Outstanding Principal Amount, since there is no info to validate that this field refers to the Outstanding Principal Amount -at the time of the application (again, if the information gets updated with the information of the ongoing approved loan, then bias will be introduced by incorporating it)
In [16]:
cols_to_be_dropped_b = [
"out_prncp",
"desc",
"initial_list_status",
"collection_recovery_fee",
"last_pymnt_d",
"last_pymnt_amnt",
"last_credit_pull_d",
"collections_12_mths_ex_med",
"mths_since_rcnt_il",
"max_bal_bc",
"all_util",
"inq_fi",
"bc_open_to_buy",
"bc_util",
"mo_sin_old_il_acct",
"mo_sin_old_rev_tl_op",
"mo_sin_rcnt_rev_tl_op",
"mo_sin_rcnt_tl",
"mths_since_recent_bc",
"num_actv_bc_tl",
"num_actv_rev_tl",
"num_bc_sats",
"num_bc_tl",
"num_il_tl",
"num_op_rev_tl",
"num_rev_tl_bal_gt_0",
"num_sats",
"num_tl_120dpd_2m",
"num_tl_30dpd",
"num_tl_90g_dpd_24m",
"num_tl_op_past_12m",
"pct_tl_nvr_dlq",
"percent_bc_gt_75",
"tax_liens",
"tot_hi_cred_lim",
"total_bc_limit",
"total_il_high_credit_limit",
"sec_app_fico_range_low",
"sec_app_fico_range_high",
"sec_app_earliest_cr_line",
"sec_app_inq_last_6mths",
"sec_app_mort_acc",
"sec_app_open_acc",
"sec_app_revol_util",
"sec_app_open_act_il",
"sec_app_num_rev_accts",
"sec_app_chargeoff_within_12_mths",
"sec_app_collections_12_mths_ex_med",
"sec_app_mths_since_last_major_derog",
"hardship_flag",
"hardship_type",
"hardship_reason",
"hardship_status",
"deferral_term",
"hardship_amount",
"hardship_start_date",
"hardship_end_date",
"payment_plan_start_date",
"hardship_length",
"hardship_dpd",
"hardship_loan_status",
"orig_projected_additional_accrued_interest",
"hardship_payoff_balance_amount",
"hardship_last_payment_amount",
"disbursement_method",
"debt_settlement_flag",
"debt_settlement_flag_date",
"settlement_status",
"settlement_date",
"settlement_amount",
"settlement_percentage",
"settlement_term",
]

The following variables may not have descreptions -but they are self-explanatory and can also provide us with very useful insights. Thus, we will proceed with analyzing these variables as well

annual_inc

addr_state

dti

delinq_2yrs

earliest_cr_line

fico_range_low

fico_range_high

last_fico_range_high

last_fico_range_low

mths_since_last_major_derog

application_type

annual_inc_joint

dti_joint

acc_now_delinq

il_util

inq_last_12m

acc_open_past_24mths

avg_cur_bal

chargeoff_within_12_mths

delinq_amnt

mort_acc

mths_since_recent_bc_dlq

mths_since_recent_inq

mths_since_recent_revol_delinq

num_accts_ever_120_pd

num_rev_accts

pub_rec_bankruptcies

total_bal_ex_mort

revol_bal_joint

Columns that have description but either do not have meaningful usage or will lead to selection bias

  • For example, the characteristic "funded_amnt_inv" (i.e. "The total amount committed by investors for that loan at that point in time"), will induce selection bias in the analysis, as investors' judgment is prune to errors and underestimation of the level of risk of each corresponding applicant.

Same goes for the "Grade" characteristic, as the model estimate should be based on credit loan, bureau ,demographic , transactional data, that are hihgly interpretable.

In [17]:
cols_to_be_dropped_c = [
"id",
"member_id",
"funded_amnt_inv",
"int_rate",
"grade",
"sub_grade",
"emp_title",
"pymnt_plan",
"url",
"title",
"out_prncp_inv",
"total_pymnt_inv",
"total_rec_int",
"total_rec_late_fee",
"next_pymnt_d",
]

Drop the unwanted variables

In [18]:
# Combine all columns to be dropped into a single list
cols_to_be_dropped = list(set(cols_to_be_dropped_a + cols_to_be_dropped_b + cols_to_be_dropped_c))

# Drop the columns from the dataset
dataset = dataset.drop(columns=cols_to_be_dropped)
In [19]:
# Print the resulting dataset to confirm the columns have been dropped
print(dataset.shape)
dataset.head()
(238636, 64)
Out[19]:
loan_amnt funded_amnt term installment emp_length home_ownership annual_inc verification_status issue_d loan_status purpose zip_code addr_state dti delinq_2yrs earliest_cr_line fico_range_low fico_range_high inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc total_pymnt total_rec_prncp recoveries last_fico_range_high last_fico_range_low mths_since_last_major_derog policy_code application_type annual_inc_joint dti_joint acc_now_delinq tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m total_bal_il il_util open_rv_12m open_rv_24m total_rev_hi_lim total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal chargeoff_within_12_mths delinq_amnt mort_acc mths_since_recent_bc_dlq mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd num_rev_accts pub_rec_bankruptcies total_bal_ex_mort revol_bal_joint Defaulted
0 5000.0 5000.0 36 months 186.82 8 years RENT 50000.0 Verified 2018-03-01 Current other 740xx OK 21.80 1.0 Jan-2009 665.0 669.0 0.0 9.0 NaN 5.0 0.0 116.0 23.2 18.0 2043.690000 1219.69 0.0 609.0 605.0 9.0 1.0 Individual NaN NaN 0.0 0.0 19344.0 0.0 2.0 0.0 1.0 14118.0 51.0 1.0 2.0 500.0 0.0 5.0 3.0 3869.0 1.0 0.0 0.0 NaN 2.0 NaN 4.0 2.0 0.0 19344.0 NaN 0
1 15000.0 15000.0 36 months 483.45 2 years OWN 196000.0 Source Verified 2018-03-01 Current debt_consolidation 337xx FL 18.29 0.0 Jul-1998 700.0 704.0 0.0 65.0 NaN 19.0 0.0 24243.0 46.3 53.0 5301.420000 4121.50 0.0 694.0 690.0 NaN 1.0 Individual NaN NaN 0.0 0.0 534954.0 4.0 3.0 2.0 2.0 113470.0 59.0 4.0 12.0 52400.0 1.0 7.0 15.0 31468.0 0.0 0.0 5.0 NaN 6.0 NaN 0.0 37.0 0.0 137713.0 NaN 0
2 11200.0 11200.0 60 months 367.82 < 1 year RENT 44000.0 Not Verified 2018-03-01 Current medical 030xx NH 43.97 1.0 Jul-2007 665.0 669.0 2.0 6.0 NaN 8.0 0.0 1526.0 24.6 14.0 4007.700000 1006.27 0.0 629.0 625.0 70.0 1.0 Joint App 81000.0 31.94 0.0 0.0 67173.0 1.0 4.0 1.0 4.0 65647.0 89.0 1.0 1.0 6200.0 1.0 10.0 5.0 8397.0 0.0 0.0 0.0 35.0 0.0 35.0 1.0 6.0 0.0 67173.0 7101.0 0
3 25000.0 25000.0 60 months 688.35 10+ years MORTGAGE 65000.0 Source Verified 2018-03-01 Current debt_consolidation 361xx AL 12.89 1.0 Mar-1995 665.0 669.0 1.0 22.0 NaN 7.0 0.0 8657.0 98.4 16.0 7511.160000 2811.27 0.0 669.0 665.0 23.0 1.0 Individual NaN NaN 0.0 0.0 74795.0 0.0 2.0 0.0 2.0 8382.0 82.0 0.0 0.0 8800.0 3.0 3.0 2.0 10685.0 0.0 0.0 2.0 NaN 0.0 22.0 2.0 9.0 0.0 17039.0 NaN 0
4 3000.0 3000.0 36 months 93.10 9 years RENT 52000.0 Source Verified 2018-03-01 Fully Paid major_purchase 988xx WA 0.58 0.0 Jan-1998 760.0 764.0 0.0 26.0 NaN 7.0 0.0 141.0 0.5 30.0 3011.577285 3000.00 0.0 764.0 760.0 NaN 1.0 Individual NaN NaN 0.0 0.0 150592.0 0.0 0.0 1.0 2.0 0.0 NaN 0.0 1.0 31000.0 2.0 2.0 3.0 25099.0 0.0 0.0 4.0 NaN 7.0 NaN 0.0 19.0 0.0 141.0 NaN 0

Continuous Vs Discrete Vs Categorical variables

In [20]:
# Exclude  the target variable
exclude_columns = ['loan_status', 'Defaulted']

# For categorical variables 
categorical = [var for var in dataset.columns if dataset[var].dtype == 'O' 
                                                 and var not in exclude_columns]

# For discrete variables
discrete = [var for var in dataset.columns if dataset[var].dtype != 'O' 
                                                 and var not in exclude_columns
                                                 and dataset[var].nunique() < 20
           ]

# For continuous variables
continuous = [var for var in dataset.columns if dataset[var].dtype != 'O' 
                                                and var not in discrete
                                                and var not in exclude_columns
             ]

print('Categorical Variables: ', categorical)
Categorical Variables:  ['term', 'emp_length', 'home_ownership', 'verification_status', 'purpose', 'zip_code', 'addr_state', 'earliest_cr_line', 'application_type']
In [21]:
print('Discrete Variables: ', discrete)
Discrete Variables:  ['issue_d', 'inq_last_6mths', 'pub_rec', 'policy_code', 'acc_now_delinq', 'open_acc_6m', 'open_il_12m', 'chargeoff_within_12_mths', 'pub_rec_bankruptcies']
In [22]:
print('Continuous Variables: ', continuous)
Continuous Variables:  ['loan_amnt', 'funded_amnt', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'revol_bal', 'revol_util', 'total_acc', 'total_pymnt', 'total_rec_prncp', 'recoveries', 'last_fico_range_high', 'last_fico_range_low', 'mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_act_il', 'open_il_24m', 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'total_rev_hi_lim', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal', 'delinq_amnt', 'mort_acc', 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd', 'num_rev_accts', 'total_bal_ex_mort', 'revol_bal_joint']
  • Number of unique values for each individual characteristic
In [23]:
num_feat = continuous
dataset[num_feat].nunique().sort_values()
Out[23]:
open_il_24m                           20
open_rv_12m                           23
delinq_2yrs                           23
mths_since_recent_inq                 26
mort_acc                              28
num_accts_ever_120_pd                 35
open_rv_24m                           36
fico_range_low                        38
fico_range_high                       38
inq_last_12m                          41
acc_open_past_24mths                  41
total_cu_tl                           42
open_act_il                           43
open_acc                              70
last_fico_range_low                   71
last_fico_range_high                  72
num_rev_accts                         89
delinq_amnt                          107
total_acc                            121
mths_since_last_record               127
mths_since_last_delinq               141
mths_since_recent_bc_dlq             147
mths_since_last_major_derog          148
mths_since_recent_revol_delinq       150
il_util                              212
revol_util                          1097
funded_amnt                         1547
loan_amnt                           1547
recoveries                          1963
dti_joint                           3808
tot_coll_amt                        5552
total_rev_hi_lim                    5751
annual_inc_joint                    6477
dti                                 7704
annual_inc                         18140
installment                        21130
revol_bal_joint                    27131
avg_cur_bal                        49650
revol_bal                          49801
total_rec_prncp                    75633
total_bal_il                       85610
total_bal_ex_mort                 104425
total_pymnt                       132825
tot_cur_bal                       159030
dtype: int64
In [24]:
num_feat = discrete
dataset[num_feat].nunique().sort_values()
Out[24]:
policy_code                  1
acc_now_delinq               2
issue_d                      6
inq_last_6mths               6
pub_rec_bankruptcies         7
open_il_12m                  8
chargeoff_within_12_mths     8
open_acc_6m                 15
pub_rec                     16
dtype: int64
In [25]:
num_feat = categorical
dataset[num_feat].nunique().sort_values()
Out[25]:
term                     2
application_type         2
verification_status      3
home_ownership           4
emp_length              11
purpose                 13
addr_state              50
earliest_cr_line       661
zip_code               886
dtype: int64
In [26]:
# Drop "policy_code", since it is a constant
dataset = dataset.drop(columns="policy_code")
dataset.shape
Out[26]:
(238636, 63)
In [27]:
# Also remove it from the list of discrete variables
discrete.remove('policy_code')
  • More Quality checks

Function for percentage calculation and cumulative percentages for each column

In [28]:
def freq(df):
    """
def freq(df):
   to obtain the number of unique values
     and their corresponding percentages for each column.
    
 Returns:
 dict: A dictionary where keys are column names and values are DataFrames
      containing the frequency and percentage of unique values.
    """
    result = {}
    
    for col in df.columns:
        value_counts = df[col].value_counts(dropna=False).sort_index()
        percentages = (value_counts / len(df)) * 100
        cumulative_percentages = percentages.cumsum()
        freq_df = pd.DataFrame({
            'Frequency': value_counts,
            'Percentage': percentages,
            'Cumulative Percentage': cumulative_percentages
        })
        result[col] = freq_df
        
    return result

# obtain the frequency and percentage of unique values for each column
freq_results = freq(dataset)
In [29]:
for col in categorical:
    print(f"Results for column: {col}")
    print(freq_results[col])
    print("\n")
Results for column: term
           Frequency  Percentage  Cumulative Percentage
term                                                   
36 months     166918   69.946697              69.946697
60 months      71718   30.053303             100.000000


Results for column: emp_length
            Frequency  Percentage  Cumulative Percentage
emp_length                                              
1 year          15805    6.623058               6.623058
10+ years       79521   33.323136              39.946194
2 years         22483    9.421462              49.367656
3 years         20051    8.402337              57.769993
4 years         15550    6.516200              64.286193
5 years         15014    6.291591              70.577784
6 years         10805    4.527816              75.105600
7 years          8958    3.753834              78.859434
8 years          7612    3.189795              82.049230
9 years          6703    2.808880              84.858110
< 1 year        16238    6.804506              91.662616
NaN             19896    8.337384             100.000000


Results for column: home_ownership
                Frequency  Percentage  Cumulative Percentage
home_ownership                                              
ANY                    18    0.007543               0.007543
MORTGAGE           113594   47.601368              47.608911
OWN                 30917   12.955715              60.564626
RENT                94107   39.435374             100.000000


Results for column: verification_status
                     Frequency  Percentage  Cumulative Percentage
verification_status                                              
Not Verified             91375   38.290535              38.290535
Source Verified          94744   39.702308              77.992843
Verified                 52517   22.007157             100.000000


Results for column: purpose
                    Frequency  Percentage  Cumulative Percentage
purpose                                                         
car                      2886    1.209373               1.209373
credit_card             57399   24.052951              25.262324
debt_consolidation     119224   49.960609              75.222934
home_improvement        17292    7.246182              82.469116
house                    3819    1.600345              84.069461
major_purchase           6980    2.924957              86.994418
medical                  3723    1.560117              88.554535
moving                   1677    0.702744              89.257279
other                   21043    8.818032              98.075311
renewable_energy          135    0.056572              98.131883
small_business           2589    1.084916              99.216799
vacation                 1864    0.781106              99.997905
wedding                     5    0.002095             100.000000


Results for column: zip_code
          Frequency  Percentage  Cumulative Percentage
zip_code                                              
010xx           372    0.155886               0.155886
011xx           147    0.061600               0.217486
012xx           100    0.041905               0.259391
013xx            72    0.030171               0.289562
014xx           183    0.076686               0.366248
...             ...         ...                    ...
995xx           278    0.116495              99.878895
996xx           136    0.056991              99.935886
997xx            92    0.038552              99.974438
998xx            47    0.019695              99.994133
999xx            14    0.005867             100.000000

[886 rows x 3 columns]


Results for column: addr_state
            Frequency  Percentage  Cumulative Percentage
addr_state                                              
AK                565    0.236762               0.236762
AL               2753    1.153640               1.390402
AR               1833    0.768115               2.158518
AZ               5842    2.448080               4.606597
CA              32004   13.411220              18.017818
CO               5447    2.282556              20.300374
CT               3802    1.593221              21.893595
DC                456    0.191086              22.084681
DE                664    0.278248              22.362929
FL              17809    7.462830              29.825760
GA               8001    3.352805              33.178565
HI                934    0.391391              33.569956
ID                797    0.333981              33.903937
IL               9415    3.945339              37.849277
IN               4174    1.749107              39.598384
KS               1843    0.772306              40.370690
KY               2256    0.945373              41.316063
LA               2536    1.062706              42.378769
MA               5452    2.284651              44.663420
MD               5793    2.427547              47.090967
ME                784    0.328534              47.419501
MI               6087    2.550747              49.970248
MN               3963    1.660688              51.630936
MO               3779    1.583583              53.214519
MS               1524    0.638630              53.853149
MT                688    0.288305              54.141454
NC               6799    2.849109              56.990563
ND                512    0.214553              57.205116
NE               1147    0.480648              57.685764
NH               1226    0.513753              58.199517
NJ               8363    3.504501              61.704018
NM               1220    0.511239              62.215257
NV               3658    1.532879              63.748135
NY              19126    8.014717              71.762852
OH               7778    3.259357              75.022210
OK               2249    0.942440              75.964649
OR               2917    1.222364              77.187013
PA               7646    3.204043              80.391056
RI               1089    0.456344              80.847399
SC               3096    1.297373              82.144773
SD                470    0.196953              82.341725
TN               3954    1.656917              83.998642
TX              19713    8.260698              92.259341
UT               1519    0.636534              92.895875
VA               6252    2.619890              95.515765
VT                590    0.247238              95.763003
WA               4973    2.083927              97.846930
WI               3182    1.333412              99.180342
WV               1476    0.618515              99.798857
WY                480    0.201143             100.000000


Results for column: earliest_cr_line
                  Frequency  Percentage  Cumulative Percentage
earliest_cr_line                                              
Apr-1955                  1    0.000419               0.000419
Apr-1963                  3    0.001257               0.001676
Apr-1964                  3    0.001257               0.002933
Apr-1965                 10    0.004190               0.007124
Apr-1966                  9    0.003771               0.010895
...                     ...         ...                    ...
Sep-2010                787    0.329791              98.978779
Sep-2011                798    0.334401              99.313180
Sep-2012                599    0.251010              99.564190
Sep-2013                553    0.231734              99.795923
Sep-2014                487    0.204077             100.000000

[661 rows x 3 columns]


Results for column: application_type
                  Frequency  Percentage  Cumulative Percentage
application_type                                              
Individual           204218   85.577197              85.577197
Joint App             34418   14.422803             100.000000


Zip code has no concentration and thus no solid predictions can be made. Thus, it will be dropped as well

In [30]:
# Drop "zip_code", since it has too many values and no concentration is identified in any specific value
dataset = dataset.drop(columns="zip_code")
# Also remove it from the list of discrete variables
categorical.remove('zip_code')
In [31]:
for col in categorical:
    print(f"Missing values for column: {col}")
    print(dataset[col].isnull().mean())
    print("\n")
Missing values for column: term
0.0


Missing values for column: emp_length
0.08337384133156775


Missing values for column: home_ownership
0.0


Missing values for column: verification_status
0.0


Missing values for column: purpose
0.0


Missing values for column: addr_state
0.0


Missing values for column: earliest_cr_line
0.0


Missing values for column: application_type
0.0


In [32]:
for col in discrete:
    print(f"Results for column: {col}")
    print(freq_results[col])
    print("\n")
Results for column: issue_d
            Frequency  Percentage  Cumulative Percentage
issue_d                                                 
2018-01-01      36347   15.231147              15.231147
2018-02-01      32746   13.722154              28.953301
2018-03-01      38771   16.246920              45.200221
2018-04-01      42928   17.988904              63.189125
2018-05-01      46311   19.406544              82.595669
2018-06-01      41533   17.404331             100.000000


Results for column: inq_last_6mths
                Frequency  Percentage  Cumulative Percentage
inq_last_6mths                                              
0.0                158875   66.576292              66.576292
1.0                 56996   23.884074              90.460366
2.0                 17353    7.271744              97.732111
3.0                  5157    2.161032              99.893143
4.0                   188    0.078781              99.971924
5.0                    67    0.028076             100.000000


Results for column: pub_rec
         Frequency  Percentage  Cumulative Percentage
pub_rec                                              
0.0         206809   86.662951              86.662951
1.0          29932   12.542952              99.205904
2.0           1239    0.519201              99.725104
3.0            368    0.154210              99.879314
4.0            154    0.064533              99.943848
5.0             66    0.027657              99.971505
6.0             29    0.012152              99.983657
7.0             17    0.007124              99.990781
8.0              7    0.002933              99.993714
9.0              6    0.002514              99.996229
10.0             2    0.000838              99.997067
13.0             3    0.001257              99.998324
15.0             1    0.000419              99.998743
19.0             1    0.000419              99.999162
24.0             1    0.000419              99.999581
52.0             1    0.000419             100.000000


Results for column: acc_now_delinq
                Frequency  Percentage  Cumulative Percentage
acc_now_delinq                                              
0.0                238610   99.989105              99.989105
1.0                    26    0.010895             100.000000


Results for column: open_acc_6m
             Frequency  Percentage  Cumulative Percentage
open_acc_6m                                              
0.0             112222   47.026434              47.026434
1.0              72299   30.296770              77.323204
2.0              33227   13.923716              91.246920
3.0              13316    5.580047              96.826967
4.0               4827    2.022746              98.849713
5.0               1741    0.729563              99.579276
6.0                586    0.245562              99.824838
7.0                246    0.103086              99.927924
8.0                112    0.046933              99.974857
9.0                 36    0.015086              99.989943
10.0                11    0.004610              99.994552
11.0                 6    0.002514              99.997067
12.0                 4    0.001676              99.998743
13.0                 2    0.000838              99.999581
15.0                 1    0.000419             100.000000


Results for column: open_il_12m
             Frequency  Percentage  Cumulative Percentage
open_il_12m                                              
0.0             132357   55.463970              55.463970
1.0              69795   29.247473              84.711443
2.0              25129   10.530264              95.241707
3.0               7771    3.256424              98.498131
4.0               2467    1.033792              99.531923
5.0                809    0.339010              99.870933
6.0                307    0.128648              99.999581
8.0                  1    0.000419             100.000000


Results for column: chargeoff_within_12_mths
                          Frequency  Percentage  Cumulative Percentage
chargeoff_within_12_mths                                              
0.0                          237159   99.381066              99.381066
1.0                            1374    0.575772              99.956838
2.0                              81    0.033943              99.990781
3.0                              13    0.005448              99.996229
4.0                               6    0.002514              99.998743
6.0                               1    0.000419              99.999162
7.0                               1    0.000419              99.999581
9.0                               1    0.000419             100.000000


Results for column: pub_rec_bankruptcies
                      Frequency  Percentage  Cumulative Percentage
pub_rec_bankruptcies                                              
0.0                      209476   87.780553              87.780553
1.0                       28793   12.065656              99.846209
2.0                         314    0.131581              99.977790
3.0                          40    0.016762              99.994552
4.0                          10    0.004190              99.998743
5.0                           2    0.000838              99.999581
7.0                           1    0.000419             100.000000


In [33]:
# Drop "acc_now_delinq", "chargeoff_within_12_mths" are quasi-constants, will be dropped as well
dataset = dataset.drop(columns=["acc_now_delinq","chargeoff_within_12_mths"])
# Also remove it from the list of discrete variables
discrete.remove('acc_now_delinq')
discrete.remove('chargeoff_within_12_mths')
In [34]:
for col in discrete:
    print(f"Missing values for column: {col}")
    print(dataset[col].isnull().mean())
    print("\n")
Missing values for column: issue_d
0.0


Missing values for column: inq_last_6mths
0.0


Missing values for column: pub_rec
0.0


Missing values for column: open_acc_6m
0.0


Missing values for column: open_il_12m
0.0


Missing values for column: pub_rec_bankruptcies
0.0


In [35]:
for col in continuous:
    print(f"Missing values for column: {col}")
    print(dataset[col].isnull().mean())
    print("\n")
Missing values for column: loan_amnt
0.0


Missing values for column: funded_amnt
0.0


Missing values for column: installment
0.0


Missing values for column: annual_inc
0.0


Missing values for column: dti
0.002464003754672388


Missing values for column: delinq_2yrs
0.0


Missing values for column: fico_range_low
0.0


Missing values for column: fico_range_high
0.0


Missing values for column: mths_since_last_delinq
0.557614944937059


Missing values for column: mths_since_last_record
0.866629511054493


Missing values for column: open_acc
0.0


Missing values for column: revol_bal
0.0


Missing values for column: revol_util
0.0012864781508238488


Missing values for column: total_acc
0.0


Missing values for column: total_pymnt
0.0


Missing values for column: total_rec_prncp
0.0


Missing values for column: recoveries
0.0


Missing values for column: last_fico_range_high
0.0


Missing values for column: last_fico_range_low
0.0


Missing values for column: mths_since_last_major_derog
0.7660746911614341


Missing values for column: annual_inc_joint
0.8557719707001459


Missing values for column: dti_joint
0.8557719707001459


Missing values for column: tot_coll_amt
0.0


Missing values for column: tot_cur_bal
0.0


Missing values for column: open_act_il
0.0


Missing values for column: open_il_24m
0.0


Missing values for column: total_bal_il
0.0


Missing values for column: il_util
0.167770160411673


Missing values for column: open_rv_12m
0.0


Missing values for column: open_rv_24m
0.0


Missing values for column: total_rev_hi_lim
0.0


Missing values for column: total_cu_tl
0.0


Missing values for column: inq_last_12m
0.0


Missing values for column: acc_open_past_24mths
0.0


Missing values for column: avg_cur_bal
6.285723863960173e-05


Missing values for column: delinq_amnt
0.0


Missing values for column: mort_acc
0.0


Missing values for column: mths_since_recent_bc_dlq
0.8008850299200456


Missing values for column: mths_since_recent_inq
0.1225380914866156


Missing values for column: mths_since_recent_revol_delinq
0.7115816557434754


Missing values for column: num_accts_ever_120_pd
0.0


Missing values for column: num_rev_accts
0.0


Missing values for column: total_bal_ex_mort
0.0


Missing values for column: revol_bal_joint
0.8557719707001459


  • Drop duplicate records
In [36]:
dataset.shape
Out[36]:
(238636, 60)
In [37]:
# No duplicate record was identified -movign on
dataset = dataset.drop_duplicates()
dataset.shape
Out[37]:
(238636, 60)

Development and Performace Validation samples (Train test split) - Correlations & some more prunning¶

Before proceeding with any statistical evaluation or preparatory processing of the data, it is essential to define the Development (training) and Performance Validation (test) sets, in order to prevent the possibility of data leakage into the PV data. We do not want -under any circumstances- to proceed with any preparatory action that would make the algorithm perform better by using information from the entire data set.

  • Since the dataset is huge, we can use a small period as training data.

Hence:

  • The Dev sample (training) spans from 01/2018 to 03/2018, covering a 3month period of approved applications
  • The PV sample (test) refers to applications of 04/2018, covering 1month period of approved applications
In [38]:
for_training = dataset[(dataset.issue_d >= '2018-01-01 00:00:00') & (dataset.issue_d < '2018-04-01 00:00:00')]
for_test =     dataset[(dataset.issue_d >= '2018-04-01 00:00:00') & (dataset.issue_d < '2018-05-01 00:00:00')]
print(for_training.shape)
for_training.head()
(107864, 60)
Out[38]:
loan_amnt funded_amnt term installment emp_length home_ownership annual_inc verification_status issue_d loan_status purpose addr_state dti delinq_2yrs earliest_cr_line fico_range_low fico_range_high inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc total_pymnt total_rec_prncp recoveries last_fico_range_high last_fico_range_low mths_since_last_major_derog application_type annual_inc_joint dti_joint tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m total_bal_il il_util open_rv_12m open_rv_24m total_rev_hi_lim total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal delinq_amnt mort_acc mths_since_recent_bc_dlq mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd num_rev_accts pub_rec_bankruptcies total_bal_ex_mort revol_bal_joint Defaulted
0 5000.0 5000.0 36 months 186.82 8 years RENT 50000.0 Verified 2018-03-01 Current other OK 21.80 1.0 Jan-2009 665.0 669.0 0.0 9.0 NaN 5.0 0.0 116.0 23.2 18.0 2043.690000 1219.69 0.0 609.0 605.0 9.0 Individual NaN NaN 0.0 19344.0 0.0 2.0 0.0 1.0 14118.0 51.0 1.0 2.0 500.0 0.0 5.0 3.0 3869.0 0.0 0.0 NaN 2.0 NaN 4.0 2.0 0.0 19344.0 NaN 0
1 15000.0 15000.0 36 months 483.45 2 years OWN 196000.0 Source Verified 2018-03-01 Current debt_consolidation FL 18.29 0.0 Jul-1998 700.0 704.0 0.0 65.0 NaN 19.0 0.0 24243.0 46.3 53.0 5301.420000 4121.50 0.0 694.0 690.0 NaN Individual NaN NaN 0.0 534954.0 4.0 3.0 2.0 2.0 113470.0 59.0 4.0 12.0 52400.0 1.0 7.0 15.0 31468.0 0.0 5.0 NaN 6.0 NaN 0.0 37.0 0.0 137713.0 NaN 0
2 11200.0 11200.0 60 months 367.82 < 1 year RENT 44000.0 Not Verified 2018-03-01 Current medical NH 43.97 1.0 Jul-2007 665.0 669.0 2.0 6.0 NaN 8.0 0.0 1526.0 24.6 14.0 4007.700000 1006.27 0.0 629.0 625.0 70.0 Joint App 81000.0 31.94 0.0 67173.0 1.0 4.0 1.0 4.0 65647.0 89.0 1.0 1.0 6200.0 1.0 10.0 5.0 8397.0 0.0 0.0 35.0 0.0 35.0 1.0 6.0 0.0 67173.0 7101.0 0
3 25000.0 25000.0 60 months 688.35 10+ years MORTGAGE 65000.0 Source Verified 2018-03-01 Current debt_consolidation AL 12.89 1.0 Mar-1995 665.0 669.0 1.0 22.0 NaN 7.0 0.0 8657.0 98.4 16.0 7511.160000 2811.27 0.0 669.0 665.0 23.0 Individual NaN NaN 0.0 74795.0 0.0 2.0 0.0 2.0 8382.0 82.0 0.0 0.0 8800.0 3.0 3.0 2.0 10685.0 0.0 2.0 NaN 0.0 22.0 2.0 9.0 0.0 17039.0 NaN 0
4 3000.0 3000.0 36 months 93.10 9 years RENT 52000.0 Source Verified 2018-03-01 Fully Paid major_purchase WA 0.58 0.0 Jan-1998 760.0 764.0 0.0 26.0 NaN 7.0 0.0 141.0 0.5 30.0 3011.577285 3000.00 0.0 764.0 760.0 NaN Individual NaN NaN 0.0 150592.0 0.0 0.0 1.0 2.0 0.0 NaN 0.0 1.0 31000.0 2.0 2.0 3.0 25099.0 0.0 4.0 NaN 7.0 NaN 0.0 19.0 0.0 141.0 NaN 0
In [39]:
print(for_test.shape)
for_test.head()
(42928, 60)
Out[39]:
loan_amnt funded_amnt term installment emp_length home_ownership annual_inc verification_status issue_d loan_status purpose addr_state dti delinq_2yrs earliest_cr_line fico_range_low fico_range_high inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc total_pymnt total_rec_prncp recoveries last_fico_range_high last_fico_range_low mths_since_last_major_derog application_type annual_inc_joint dti_joint tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m total_bal_il il_util open_rv_12m open_rv_24m total_rev_hi_lim total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal delinq_amnt mort_acc mths_since_recent_bc_dlq mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd num_rev_accts pub_rec_bankruptcies total_bal_ex_mort revol_bal_joint Defaulted
194739 9600.0 9600.0 36 months 332.98 10+ years MORTGAGE 145000.0 Verified 2018-04-01 Current other GA 25.99 1.0 Oct-1988 670.0 674.0 1.0 21.0 NaN 26.0 0.0 72673.0 83.3 35.0 3325.15 2250.64 0.0 669.0 665.0 NaN Individual NaN NaN 0.0 449697.0 1.0 2.0 1.0 2.0 27401.0 88.0 0.0 0.0 87200.0 0.0 4.0 3.0 17296.0 0.0 3.0 NaN 3.0 21.0 0.0 28.0 0.0 100074.0 NaN 0
194743 12800.0 12800.0 60 months 295.06 4 years RENT 40000.0 Not Verified 2018-04-01 Current debt_consolidation MD 38.28 0.0 Jul-1991 725.0 729.0 0.0 NaN NaN 24.0 0.0 18798.0 47.7 33.0 2940.94 1580.91 0.0 694.0 690.0 NaN Individual NaN NaN 0.0 38625.0 0.0 2.0 2.0 2.0 19827.0 56.0 0.0 5.0 39400.0 2.0 0.0 7.0 1756.0 0.0 0.0 NaN 16.0 NaN 0.0 25.0 0.0 38625.0 NaN 0
194747 20000.0 20000.0 60 months 486.47 10+ years MORTGAGE 70000.0 Not Verified 2018-04-01 Current debt_consolidation NE 3.60 0.0 Jan-2006 695.0 699.0 1.0 NaN NaN 5.0 0.0 8409.0 71.3 31.0 10242.59 8279.87 0.0 694.0 690.0 NaN Joint App 140000.0 1.80 0.0 183164.0 2.0 1.0 1.0 3.0 2314.0 101.0 0.0 1.0 11800.0 0.0 3.0 5.0 36633.0 0.0 2.0 NaN 5.0 NaN 0.0 15.0 0.0 10723.0 8409.0 0
194749 24775.0 24775.0 60 months 602.62 10+ years MORTGAGE 58000.0 Not Verified 2018-04-01 Current debt_consolidation MI 30.58 0.0 Aug-2002 720.0 724.0 0.0 NaN NaN 9.0 0.0 23769.0 70.3 24.0 6469.97 3199.93 0.0 714.0 710.0 NaN Joint App 120000.0 21.36 0.0 294371.0 0.0 3.0 1.0 2.0 42667.0 83.0 0.0 0.0 33800.0 9.0 2.0 2.0 32708.0 0.0 2.0 NaN 8.0 NaN 0.0 8.0 0.0 66436.0 56210.0 0
194750 2000.0 2000.0 36 months 64.46 3 years RENT 33600.0 Source Verified 2018-04-01 Current credit_card NH 11.57 0.0 Dec-2014 700.0 704.0 2.0 NaN NaN 5.0 0.0 2518.0 32.3 6.0 643.50 497.49 0.0 624.0 620.0 NaN Individual NaN NaN 0.0 97754.0 1.0 1.0 0.0 0.0 5339.0 38.0 1.0 3.0 7800.0 2.0 11.0 4.0 19551.0 0.0 1.0 NaN 1.0 NaN 0.0 3.0 0.0 7857.0 NaN 0
In [40]:
X_train , y_train = for_training.drop(columns=['Defaulted', 'loan_status']), for_training['Defaulted']
X_train.shape, y_train.shape
Out[40]:
((107864, 58), (107864,))
In [41]:
X_test , y_test = for_test.drop(columns=['Defaulted', 'loan_status']), for_test['Defaulted']
X_test.shape, y_test.shape
Out[41]:
((42928, 58), (42928,))
In [42]:
X_train
Out[42]:
loan_amnt funded_amnt term installment emp_length home_ownership annual_inc verification_status issue_d purpose addr_state dti delinq_2yrs earliest_cr_line fico_range_low fico_range_high inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc total_pymnt total_rec_prncp recoveries last_fico_range_high last_fico_range_low mths_since_last_major_derog application_type annual_inc_joint dti_joint tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m total_bal_il il_util open_rv_12m open_rv_24m total_rev_hi_lim total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal delinq_amnt mort_acc mths_since_recent_bc_dlq mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd num_rev_accts pub_rec_bankruptcies total_bal_ex_mort revol_bal_joint
0 5000.0 5000.0 36 months 186.82 8 years RENT 50000.00 Verified 2018-03-01 other OK 21.80 1.0 Jan-2009 665.0 669.0 0.0 9.0 NaN 5.0 0.0 116.0 23.2 18.0 2043.690000 1219.69 0.0 609.0 605.0 9.0 Individual NaN NaN 0.0 19344.0 0.0 2.0 0.0 1.0 14118.0 51.0 1.0 2.0 500.0 0.0 5.0 3.0 3869.0 0.0 0.0 NaN 2.0 NaN 4.0 2.0 0.0 19344.0 NaN
1 15000.0 15000.0 36 months 483.45 2 years OWN 196000.00 Source Verified 2018-03-01 debt_consolidation FL 18.29 0.0 Jul-1998 700.0 704.0 0.0 65.0 NaN 19.0 0.0 24243.0 46.3 53.0 5301.420000 4121.50 0.0 694.0 690.0 NaN Individual NaN NaN 0.0 534954.0 4.0 3.0 2.0 2.0 113470.0 59.0 4.0 12.0 52400.0 1.0 7.0 15.0 31468.0 0.0 5.0 NaN 6.0 NaN 0.0 37.0 0.0 137713.0 NaN
2 11200.0 11200.0 60 months 367.82 < 1 year RENT 44000.00 Not Verified 2018-03-01 medical NH 43.97 1.0 Jul-2007 665.0 669.0 2.0 6.0 NaN 8.0 0.0 1526.0 24.6 14.0 4007.700000 1006.27 0.0 629.0 625.0 70.0 Joint App 81000.0 31.94 0.0 67173.0 1.0 4.0 1.0 4.0 65647.0 89.0 1.0 1.0 6200.0 1.0 10.0 5.0 8397.0 0.0 0.0 35.0 0.0 35.0 1.0 6.0 0.0 67173.0 7101.0
3 25000.0 25000.0 60 months 688.35 10+ years MORTGAGE 65000.00 Source Verified 2018-03-01 debt_consolidation AL 12.89 1.0 Mar-1995 665.0 669.0 1.0 22.0 NaN 7.0 0.0 8657.0 98.4 16.0 7511.160000 2811.27 0.0 669.0 665.0 23.0 Individual NaN NaN 0.0 74795.0 0.0 2.0 0.0 2.0 8382.0 82.0 0.0 0.0 8800.0 3.0 3.0 2.0 10685.0 0.0 2.0 NaN 0.0 22.0 2.0 9.0 0.0 17039.0 NaN
4 3000.0 3000.0 36 months 93.10 9 years RENT 52000.00 Source Verified 2018-03-01 major_purchase WA 0.58 0.0 Jan-1998 760.0 764.0 0.0 26.0 NaN 7.0 0.0 141.0 0.5 30.0 3011.577285 3000.00 0.0 764.0 760.0 NaN Individual NaN NaN 0.0 150592.0 0.0 0.0 1.0 2.0 0.0 NaN 0.0 1.0 31000.0 2.0 2.0 3.0 25099.0 0.0 4.0 NaN 7.0 NaN 0.0 19.0 0.0 141.0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107859 12000.0 12000.0 60 months 270.71 NaN MORTGAGE 89625.39 Not Verified 2018-01-01 debt_consolidation CA 17.61 0.0 Jan-1982 660.0 664.0 0.0 40.0 NaN 10.0 0.0 18601.0 90.3 37.0 12483.154233 12000.00 0.0 689.0 685.0 40.0 Individual NaN NaN 0.0 473894.0 1.0 2.0 2.0 2.0 25536.0 88.0 1.0 1.0 20600.0 0.0 0.0 3.0 47389.0 0.0 1.0 40.0 NaN 40.0 16.0 27.0 0.0 44374.0 NaN
107860 4375.0 4375.0 36 months 149.70 10+ years MORTGAGE 52000.00 Not Verified 2018-01-01 home_improvement IL 33.72 0.0 Feb-1994 690.0 694.0 0.0 NaN NaN 22.0 0.0 28116.0 49.2 41.0 2092.380000 1487.25 0.0 684.0 680.0 NaN Individual NaN NaN 249.0 217780.0 1.0 2.0 0.0 2.0 22184.0 66.0 1.0 3.0 57200.0 0.0 0.0 5.0 9899.0 0.0 3.0 NaN 17.0 NaN 0.0 34.0 0.0 50300.0 NaN
107861 6000.0 6000.0 36 months 196.18 10+ years MORTGAGE 50000.00 Source Verified 2018-01-01 debt_consolidation NH 28.93 0.0 Jun-1997 690.0 694.0 0.0 58.0 NaN 11.0 0.0 6950.0 51.9 14.0 2742.880000 2104.37 0.0 819.0 815.0 58.0 Individual NaN NaN 0.0 230614.0 0.0 1.0 1.0 1.0 18497.0 83.0 1.0 1.0 13400.0 0.0 2.0 2.0 20965.0 0.0 2.0 58.0 7.0 58.0 1.0 11.0 0.0 25447.0 NaN
107862 12000.0 12000.0 36 months 389.58 8 years MORTGAGE 36000.00 Verified 2018-01-01 debt_consolidation IN 11.10 1.0 May-1998 685.0 689.0 0.0 21.0 NaN 14.0 0.0 11648.0 43.6 18.0 5593.050000 4383.16 0.0 694.0 690.0 21.0 Individual NaN NaN 0.0 191131.0 2.0 1.0 0.0 0.0 105786.0 NaN 3.0 6.0 26700.0 0.0 1.0 6.0 14702.0 0.0 1.0 NaN 11.0 NaN 1.0 12.0 0.0 117434.0 NaN
107863 14000.0 14000.0 36 months 475.71 2 years OWN 80000.00 Source Verified 2018-01-01 car CA 1.35 0.0 Jul-2007 660.0 664.0 1.0 31.0 NaN 11.0 0.0 1461.0 4.1 21.0 14662.947011 14000.00 0.0 674.0 670.0 31.0 Individual NaN NaN 0.0 1461.0 1.0 0.0 0.0 1.0 0.0 NaN 1.0 2.0 35300.0 0.0 2.0 3.0 162.0 0.0 0.0 31.0 0.0 31.0 5.0 19.0 0.0 1461.0 NaN

107864 rows × 58 columns

In [43]:
X_test
Out[43]:
loan_amnt funded_amnt term installment emp_length home_ownership annual_inc verification_status issue_d purpose addr_state dti delinq_2yrs earliest_cr_line fico_range_low fico_range_high inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc total_pymnt total_rec_prncp recoveries last_fico_range_high last_fico_range_low mths_since_last_major_derog application_type annual_inc_joint dti_joint tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m total_bal_il il_util open_rv_12m open_rv_24m total_rev_hi_lim total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal delinq_amnt mort_acc mths_since_recent_bc_dlq mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd num_rev_accts pub_rec_bankruptcies total_bal_ex_mort revol_bal_joint
194739 9600.0 9600.0 36 months 332.98 10+ years MORTGAGE 145000.0 Verified 2018-04-01 other GA 25.99 1.0 Oct-1988 670.0 674.0 1.0 21.0 NaN 26.0 0.0 72673.0 83.3 35.0 3325.15 2250.64 0.0 669.0 665.0 NaN Individual NaN NaN 0.0 449697.0 1.0 2.0 1.0 2.0 27401.0 88.0 0.0 0.0 87200.0 0.0 4.0 3.0 17296.0 0.0 3.0 NaN 3.0 21.0 0.0 28.0 0.0 100074.0 NaN
194743 12800.0 12800.0 60 months 295.06 4 years RENT 40000.0 Not Verified 2018-04-01 debt_consolidation MD 38.28 0.0 Jul-1991 725.0 729.0 0.0 NaN NaN 24.0 0.0 18798.0 47.7 33.0 2940.94 1580.91 0.0 694.0 690.0 NaN Individual NaN NaN 0.0 38625.0 0.0 2.0 2.0 2.0 19827.0 56.0 0.0 5.0 39400.0 2.0 0.0 7.0 1756.0 0.0 0.0 NaN 16.0 NaN 0.0 25.0 0.0 38625.0 NaN
194747 20000.0 20000.0 60 months 486.47 10+ years MORTGAGE 70000.0 Not Verified 2018-04-01 debt_consolidation NE 3.60 0.0 Jan-2006 695.0 699.0 1.0 NaN NaN 5.0 0.0 8409.0 71.3 31.0 10242.59 8279.87 0.0 694.0 690.0 NaN Joint App 140000.0 1.80 0.0 183164.0 2.0 1.0 1.0 3.0 2314.0 101.0 0.0 1.0 11800.0 0.0 3.0 5.0 36633.0 0.0 2.0 NaN 5.0 NaN 0.0 15.0 0.0 10723.0 8409.0
194749 24775.0 24775.0 60 months 602.62 10+ years MORTGAGE 58000.0 Not Verified 2018-04-01 debt_consolidation MI 30.58 0.0 Aug-2002 720.0 724.0 0.0 NaN NaN 9.0 0.0 23769.0 70.3 24.0 6469.97 3199.93 0.0 714.0 710.0 NaN Joint App 120000.0 21.36 0.0 294371.0 0.0 3.0 1.0 2.0 42667.0 83.0 0.0 0.0 33800.0 9.0 2.0 2.0 32708.0 0.0 2.0 NaN 8.0 NaN 0.0 8.0 0.0 66436.0 56210.0
194750 2000.0 2000.0 36 months 64.46 3 years RENT 33600.0 Source Verified 2018-04-01 credit_card NH 11.57 0.0 Dec-2014 700.0 704.0 2.0 NaN NaN 5.0 0.0 2518.0 32.3 6.0 643.50 497.49 0.0 624.0 620.0 NaN Individual NaN NaN 0.0 97754.0 1.0 1.0 0.0 0.0 5339.0 38.0 1.0 3.0 7800.0 2.0 11.0 4.0 19551.0 0.0 1.0 NaN 1.0 NaN 0.0 3.0 0.0 7857.0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
238631 12000.0 12000.0 36 months 398.46 NaN MORTGAGE 60000.0 Not Verified 2018-04-01 major_purchase NV 28.64 0.0 Mar-2005 720.0 724.0 0.0 NaN 102.0 16.0 1.0 12025.0 38.2 22.0 3570.17 2610.46 0.0 739.0 735.0 NaN Individual NaN NaN 0.0 230207.0 0.0 2.0 0.0 0.0 28116.0 34.0 5.0 9.0 31500.0 0.0 2.0 9.0 15347.0 0.0 1.0 NaN 7.0 NaN 0.0 16.0 1.0 40141.0 NaN
238632 3000.0 3000.0 36 months 112.09 10+ years RENT 72000.0 Not Verified 2018-04-01 other NJ 5.87 0.0 Oct-2007 660.0 664.0 0.0 NaN NaN 11.0 0.0 5509.0 60.5 13.0 1117.50 659.82 0.0 499.0 0.0 NaN Individual NaN NaN 0.0 5509.0 1.0 0.0 0.0 0.0 0.0 NaN 3.0 9.0 9100.0 0.0 1.0 9.0 501.0 0.0 0.0 NaN 8.0 NaN 0.0 13.0 0.0 5509.0 NaN
238633 14000.0 14000.0 36 months 492.27 10+ years MORTGAGE 52874.0 Not Verified 2018-04-01 debt_consolidation CA 22.36 0.0 Sep-1995 660.0 664.0 1.0 NaN 87.0 9.0 1.0 25426.0 85.0 20.0 5402.52 3593.73 0.0 679.0 675.0 NaN Individual NaN NaN 0.0 32056.0 2.0 1.0 1.0 1.0 6630.0 88.0 1.0 2.0 29900.0 1.0 1.0 3.0 3561.0 0.0 0.0 NaN 6.0 NaN 0.0 19.0 1.0 32056.0 NaN
238634 7500.0 7500.0 36 months 245.19 10+ years RENT 126000.0 Not Verified 2018-04-01 other NY 6.33 0.0 Dec-1986 740.0 744.0 1.0 NaN NaN 16.0 0.0 11122.0 28.3 31.0 2688.01 2038.63 0.0 709.0 705.0 NaN Individual NaN NaN 0.0 132066.0 1.0 0.0 0.0 0.0 0.0 NaN 5.0 8.0 39300.0 0.0 2.0 11.0 8254.0 0.0 3.0 NaN 1.0 NaN 0.0 24.0 0.0 11122.0 NaN
238635 35000.0 35000.0 36 months 1065.88 10+ years RENT 110000.0 Source Verified 2018-04-01 debt_consolidation FL 9.80 0.0 Jul-1995 715.0 719.0 0.0 NaN NaN 13.0 0.0 39634.0 35.2 21.0 11712.88 10028.29 0.0 774.0 770.0 NaN Individual NaN NaN 0.0 39634.0 2.0 0.0 0.0 0.0 0.0 NaN 2.0 3.0 112500.0 0.0 0.0 3.0 3049.0 0.0 2.0 NaN NaN NaN 0.0 14.0 0.0 39634.0 NaN

42928 rows × 58 columns

  • Identify all the pairs of variables that are linearly correlated >=0.85 (in absolute terms)
In [44]:
continuous_discr = continuous.copy()
continuous_discr.extend(discrete)
# continuous_discr.remove('Defaulted')

# Computation of the correlation matrix
corr_matrix = X_train[continuous_discr].corr()

# Selecting the upper triangle of the correlation matrix
# Thus avoidign duplicate pairs (i.e. (A, B) and (B, A))
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find pairs with an absolute correlation coefficient ≥ 0.85
high_corr_pairs = upper_triangle.stack().reset_index()
high_corr_pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']
high_corr_pairs = high_corr_pairs[high_corr_pairs['Correlation'].abs() >= 0.85]

print(high_corr_pairs)
                     Feature 1                       Feature 2  Correlation
0                    loan_amnt                     funded_amnt     1.000000
1                    loan_amnt                     installment     0.944957
49                 funded_amnt                     installment     0.944957
279             fico_range_low                 fico_range_high     1.000000
394     mths_since_last_delinq  mths_since_recent_revol_delinq     0.860612
595                total_pymnt                 total_rec_prncp     0.975128
697       last_fico_range_high             last_fico_range_low     0.876065
964               total_bal_il               total_bal_ex_mort     0.907411
1148  mths_since_recent_bc_dlq  mths_since_recent_revol_delinq     0.893902
In [45]:
# Will drop one for each pair respectively (hence will keep one per pair of correlated predictors).
# Will also have in mind the conceptual soundess of the analysis that will follow
drop_correlated_cols = [
"funded_amnt",
"loan_amnt",
"fico_range_high",
"mths_since_recent_revol_delinq",
"pub_rec_bankruptcies",
"total_rec_prncp",
"last_fico_range_low",
"total_bal_il",
"total_bal_ex_mort"
]

# Drop the columns from the train and test datasets
X_train = X_train.drop(columns=drop_correlated_cols)
X_test = X_test.drop(columns=drop_correlated_cols)

# confirm
print(X_train.shape)
print(X_test.shape)
(107864, 49)
(42928, 49)
  • Update the lists with the names of the remaining vars
In [46]:
# For categorical variables 
categorical = [var for var in X_train.columns if X_train[var].dtype == 'O']

# For discrete variables
discrete = [var for var in X_train.columns if X_train[var].dtype != 'O' 
                                                 and X_train[var].nunique() < 20
           ]

# For continuous variables
continuous = [var for var in X_train.columns if X_train[var].dtype != 'O' 
                                                and var not in discrete
             ]

print('Categorical Variables: ', categorical)
print('Discrete Variables: ', discrete)
print('Continuous Variables: ', continuous)
Categorical Variables:  ['term', 'emp_length', 'home_ownership', 'verification_status', 'purpose', 'addr_state', 'earliest_cr_line', 'application_type']
Discrete Variables:  ['issue_d', 'inq_last_6mths', 'pub_rec', 'open_acc_6m', 'open_il_12m', 'open_il_24m']
Continuous Variables:  ['installment', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_low', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'revol_bal', 'revol_util', 'total_acc', 'total_pymnt', 'recoveries', 'last_fico_range_high', 'mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_act_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'total_rev_hi_lim', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal', 'delinq_amnt', 'mort_acc', 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_rev_accts', 'revol_bal_joint']
In [47]:
# Remove also the "issue_d" from the list of discrete vars, since it does not provide any discriminatory power 
# (and if it would -it wouldnt be conceptually sound)
# Drop the columns from the train and test datasets
discrete.remove('issue_d')

Statistical Analysis¶

  • Descriptive statistics
In [48]:
X_train.describe()
Out[48]:
installment annual_inc issue_d dti delinq_2yrs fico_range_low inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc total_pymnt recoveries last_fico_range_high mths_since_last_major_derog annual_inc_joint dti_joint tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m il_util open_rv_12m open_rv_24m total_rev_hi_lim total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal delinq_amnt mort_acc mths_since_recent_bc_dlq mths_since_recent_inq num_accts_ever_120_pd num_rev_accts revol_bal_joint
count 107864.000000 1.078640e+05 107864 107602.000000 107864.000000 107864.000000 107864.000000 47169.000000 15269.000000 107864.000000 107864.000000 107864.000000 107715.000000 107864.000000 107864.000000 107864.000000 107864.000000 24761.000000 1.633100e+04 16331.000000 1.078640e+05 1.078640e+05 107864.000000 107864.000000 107864.000000 107864.000000 89880.000000 107864.000000 107864.000000 1.078640e+05 107864.000000 107864.000000 107864.000000 107854.000000 107864.000000 107864.000000 21298.000000 94216.000000 107864.000000 107864.000000 16331.000000
mean 469.646694 7.854227e+04 2018-01-31 14:50:23.273752064 19.648209 0.223773 708.248303 0.461266 36.980008 79.156657 11.377151 0.163243 16093.382880 43.372762 22.618019 7694.295719 31.738431 705.486956 46.193167 1.255066e+05 19.661708 2.932763e+02 1.437632e+05 0.904936 2.655613 0.648252 1.494975 67.606019 1.221529 2.601331 3.835303e+04 1.458457 1.954786 4.376817 13884.263523 3.559640 1.392967 40.445723 7.390942 0.449242 12.970871 34531.574919
min 29.760000 0.000000e+00 2018-01-01 00:00:00 0.000000 0.000000 660.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.000000 0.000000 0.000000 0.000000 1.800000e+04 0.000000 0.000000e+00 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.000000
25% 254.560000 4.500000e+04 2018-01-01 00:00:00 11.230000 0.000000 680.000000 0.000000 19.000000 64.000000 7.000000 0.000000 5170.000000 23.500000 14.000000 3497.757500 0.000000 674.000000 30.000000 8.500000e+04 13.730000 0.000000e+00 2.649400e+04 0.000000 1.000000 0.000000 0.000000 53.000000 0.000000 1.000000 1.600000e+04 0.000000 0.000000 2.000000 2925.000000 0.000000 0.000000 23.000000 2.000000 0.000000 7.000000 15760.500000
50% 389.360000 6.500000e+04 2018-02-01 00:00:00 17.670000 0.000000 700.000000 0.000000 34.000000 83.000000 10.000000 0.000000 10657.000000 41.500000 21.000000 5744.400000 0.000000 709.000000 47.000000 1.120000e+05 19.200000 0.000000e+00 7.513550e+04 1.000000 2.000000 0.000000 1.000000 70.000000 1.000000 2.000000 2.850000e+04 0.000000 1.000000 4.000000 7169.500000 0.000000 1.000000 38.000000 6.000000 0.000000 11.000000 27652.000000
75% 637.647500 9.500000e+04 2018-03-01 00:00:00 25.020000 0.000000 730.000000 1.000000 53.000000 98.000000 14.000000 0.000000 19511.250000 61.800000 29.000000 9696.480000 0.000000 744.000000 63.000000 1.500000e+05 25.200000 0.000000e+00 2.173230e+05 1.000000 3.000000 1.000000 2.000000 84.000000 2.000000 4.000000 4.890000e+04 2.000000 3.000000 6.000000 19215.000000 0.000000 2.000000 57.000000 11.000000 0.000000 17.000000 45080.500000
max 1618.030000 8.365188e+06 2018-03-01 00:00:00 999.000000 20.000000 845.000000 5.000000 226.000000 124.000000 69.000000 52.000000 925589.000000 191.000000 153.000000 51653.389338 21251.790000 850.000000 226.000000 1.187000e+06 39.980000 6.214661e+06 5.752177e+06 12.000000 41.000000 8.000000 20.000000 1000.000000 19.000000 38.000000 1.123500e+06 52.000000 46.000000 38.000000 620531.000000 65000.000000 46.000000 194.000000 24.000000 37.000000 151.000000 371153.000000
std 289.215801 7.687436e+04 NaN 21.795902 0.730417 37.205453 0.746153 21.802486 24.817082 5.879388 0.494799 22176.260283 25.149181 12.027177 6738.872144 334.216920 59.401365 21.555390 6.681665e+04 8.038172 2.481319e+04 1.676133e+05 1.122125 2.930210 0.912269 1.531529 24.141432 1.474057 2.527823 3.671479e+04 2.632854 2.372746 3.191143 17487.395429 324.829448 1.751895 22.257444 6.014327 1.335028 7.885984 28087.749846

Diagnostic plots for the Discrete variables

In [49]:
for col in discrete:
    plt.figure(figsize=(15,4))
    
    # First plot: Distribution plot
    plt.subplot(121)  # Change to 121 to arrange plots side by side
    sns.histplot(X_train[col], kde=True, label='skew' + str(np.round(X_train[col].skew(), 2)))
    plt.legend()
    
    # Second plot: Box plot
    plt.subplot(122)  # Change to 122 to arrange plots side by side
    sns.boxplot(x=X_train[col])
    
    plt.tight_layout()
    plt.show()

Diagnostic plots for the Continuous variables

In [50]:
for col in continuous:
    plt.figure(figsize=(15,4))
    
    # First plot: Distribution plot
    plt.subplot(121)  # Change to 121 to arrange plots side by side
    sns.histplot(X_train[col], kde=True, label='skew' + str(np.round(X_train[col].skew(), 2)))
    plt.legend()
    
    # Second plot: Box plot
    plt.subplot(122)  # Change to 122 to arrange plots side by side
    sns.boxplot(x=X_train[col])
    
    plt.tight_layout()
    plt.show()

For IV/Segmentation analysis¶

In [51]:
X_train_for_segm = X_train.reset_index(drop=True)
y_train_for_segm = y_train.reset_index(drop=True)

# Merge the DataFrames using the index
for_segmentation = X_train_for_segm.merge(y_train_for_segm, left_index=True, right_index=True)

for_segmentation.head()
Out[51]:
term installment emp_length home_ownership annual_inc verification_status issue_d purpose addr_state dti delinq_2yrs earliest_cr_line fico_range_low inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc total_pymnt recoveries last_fico_range_high mths_since_last_major_derog application_type annual_inc_joint dti_joint tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m il_util open_rv_12m open_rv_24m total_rev_hi_lim total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal delinq_amnt mort_acc mths_since_recent_bc_dlq mths_since_recent_inq num_accts_ever_120_pd num_rev_accts revol_bal_joint Defaulted
0 36 months 186.82 8 years RENT 50000.0 Verified 2018-03-01 other OK 21.80 1.0 Jan-2009 665.0 0.0 9.0 NaN 5.0 0.0 116.0 23.2 18.0 2043.690000 0.0 609.0 9.0 Individual NaN NaN 0.0 19344.0 0.0 2.0 0.0 1.0 51.0 1.0 2.0 500.0 0.0 5.0 3.0 3869.0 0.0 0.0 NaN 2.0 4.0 2.0 NaN 0
1 36 months 483.45 2 years OWN 196000.0 Source Verified 2018-03-01 debt_consolidation FL 18.29 0.0 Jul-1998 700.0 0.0 65.0 NaN 19.0 0.0 24243.0 46.3 53.0 5301.420000 0.0 694.0 NaN Individual NaN NaN 0.0 534954.0 4.0 3.0 2.0 2.0 59.0 4.0 12.0 52400.0 1.0 7.0 15.0 31468.0 0.0 5.0 NaN 6.0 0.0 37.0 NaN 0
2 60 months 367.82 < 1 year RENT 44000.0 Not Verified 2018-03-01 medical NH 43.97 1.0 Jul-2007 665.0 2.0 6.0 NaN 8.0 0.0 1526.0 24.6 14.0 4007.700000 0.0 629.0 70.0 Joint App 81000.0 31.94 0.0 67173.0 1.0 4.0 1.0 4.0 89.0 1.0 1.0 6200.0 1.0 10.0 5.0 8397.0 0.0 0.0 35.0 0.0 1.0 6.0 7101.0 0
3 60 months 688.35 10+ years MORTGAGE 65000.0 Source Verified 2018-03-01 debt_consolidation AL 12.89 1.0 Mar-1995 665.0 1.0 22.0 NaN 7.0 0.0 8657.0 98.4 16.0 7511.160000 0.0 669.0 23.0 Individual NaN NaN 0.0 74795.0 0.0 2.0 0.0 2.0 82.0 0.0 0.0 8800.0 3.0 3.0 2.0 10685.0 0.0 2.0 NaN 0.0 2.0 9.0 NaN 0
4 36 months 93.10 9 years RENT 52000.0 Source Verified 2018-03-01 major_purchase WA 0.58 0.0 Jan-1998 760.0 0.0 26.0 NaN 7.0 0.0 141.0 0.5 30.0 3011.577285 0.0 764.0 NaN Individual NaN NaN 0.0 150592.0 0.0 0.0 1.0 2.0 NaN 0.0 1.0 31000.0 2.0 2.0 3.0 25099.0 0.0 4.0 NaN 7.0 0.0 19.0 NaN 0
In [52]:
# Fill missing values accordingly
for_segmentation[categorical] = for_segmentation[categorical].fillna('MISSING')
for_segmentation[discrete] = for_segmentation[discrete].fillna(-1)
for_segmentation[continuous] = for_segmentation[continuous].fillna(-1)
In [53]:
for_segmentation['dti_rounded'] = np.round(for_segmentation['dti'],0).astype(int)
for_segmentation
Out[53]:
term installment emp_length home_ownership annual_inc verification_status issue_d purpose addr_state dti delinq_2yrs earliest_cr_line fico_range_low inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc total_pymnt recoveries last_fico_range_high mths_since_last_major_derog application_type annual_inc_joint dti_joint tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m il_util open_rv_12m open_rv_24m total_rev_hi_lim total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal delinq_amnt mort_acc mths_since_recent_bc_dlq mths_since_recent_inq num_accts_ever_120_pd num_rev_accts revol_bal_joint Defaulted dti_rounded
0 36 months 186.82 8 years RENT 50000.00 Verified 2018-03-01 other OK 21.80 1.0 Jan-2009 665.0 0.0 9.0 -1.0 5.0 0.0 116.0 23.2 18.0 2043.690000 0.0 609.0 9.0 Individual -1.0 -1.00 0.0 19344.0 0.0 2.0 0.0 1.0 51.0 1.0 2.0 500.0 0.0 5.0 3.0 3869.0 0.0 0.0 -1.0 2.0 4.0 2.0 -1.0 0 22
1 36 months 483.45 2 years OWN 196000.00 Source Verified 2018-03-01 debt_consolidation FL 18.29 0.0 Jul-1998 700.0 0.0 65.0 -1.0 19.0 0.0 24243.0 46.3 53.0 5301.420000 0.0 694.0 -1.0 Individual -1.0 -1.00 0.0 534954.0 4.0 3.0 2.0 2.0 59.0 4.0 12.0 52400.0 1.0 7.0 15.0 31468.0 0.0 5.0 -1.0 6.0 0.0 37.0 -1.0 0 18
2 60 months 367.82 < 1 year RENT 44000.00 Not Verified 2018-03-01 medical NH 43.97 1.0 Jul-2007 665.0 2.0 6.0 -1.0 8.0 0.0 1526.0 24.6 14.0 4007.700000 0.0 629.0 70.0 Joint App 81000.0 31.94 0.0 67173.0 1.0 4.0 1.0 4.0 89.0 1.0 1.0 6200.0 1.0 10.0 5.0 8397.0 0.0 0.0 35.0 0.0 1.0 6.0 7101.0 0 44
3 60 months 688.35 10+ years MORTGAGE 65000.00 Source Verified 2018-03-01 debt_consolidation AL 12.89 1.0 Mar-1995 665.0 1.0 22.0 -1.0 7.0 0.0 8657.0 98.4 16.0 7511.160000 0.0 669.0 23.0 Individual -1.0 -1.00 0.0 74795.0 0.0 2.0 0.0 2.0 82.0 0.0 0.0 8800.0 3.0 3.0 2.0 10685.0 0.0 2.0 -1.0 0.0 2.0 9.0 -1.0 0 13
4 36 months 93.10 9 years RENT 52000.00 Source Verified 2018-03-01 major_purchase WA 0.58 0.0 Jan-1998 760.0 0.0 26.0 -1.0 7.0 0.0 141.0 0.5 30.0 3011.577285 0.0 764.0 -1.0 Individual -1.0 -1.00 0.0 150592.0 0.0 0.0 1.0 2.0 -1.0 0.0 1.0 31000.0 2.0 2.0 3.0 25099.0 0.0 4.0 -1.0 7.0 0.0 19.0 -1.0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107859 60 months 270.71 MISSING MORTGAGE 89625.39 Not Verified 2018-01-01 debt_consolidation CA 17.61 0.0 Jan-1982 660.0 0.0 40.0 -1.0 10.0 0.0 18601.0 90.3 37.0 12483.154233 0.0 689.0 40.0 Individual -1.0 -1.00 0.0 473894.0 1.0 2.0 2.0 2.0 88.0 1.0 1.0 20600.0 0.0 0.0 3.0 47389.0 0.0 1.0 40.0 -1.0 16.0 27.0 -1.0 0 18
107860 36 months 149.70 10+ years MORTGAGE 52000.00 Not Verified 2018-01-01 home_improvement IL 33.72 0.0 Feb-1994 690.0 0.0 -1.0 -1.0 22.0 0.0 28116.0 49.2 41.0 2092.380000 0.0 684.0 -1.0 Individual -1.0 -1.00 249.0 217780.0 1.0 2.0 0.0 2.0 66.0 1.0 3.0 57200.0 0.0 0.0 5.0 9899.0 0.0 3.0 -1.0 17.0 0.0 34.0 -1.0 0 34
107861 36 months 196.18 10+ years MORTGAGE 50000.00 Source Verified 2018-01-01 debt_consolidation NH 28.93 0.0 Jun-1997 690.0 0.0 58.0 -1.0 11.0 0.0 6950.0 51.9 14.0 2742.880000 0.0 819.0 58.0 Individual -1.0 -1.00 0.0 230614.0 0.0 1.0 1.0 1.0 83.0 1.0 1.0 13400.0 0.0 2.0 2.0 20965.0 0.0 2.0 58.0 7.0 1.0 11.0 -1.0 0 29
107862 36 months 389.58 8 years MORTGAGE 36000.00 Verified 2018-01-01 debt_consolidation IN 11.10 1.0 May-1998 685.0 0.0 21.0 -1.0 14.0 0.0 11648.0 43.6 18.0 5593.050000 0.0 694.0 21.0 Individual -1.0 -1.00 0.0 191131.0 2.0 1.0 0.0 0.0 -1.0 3.0 6.0 26700.0 0.0 1.0 6.0 14702.0 0.0 1.0 -1.0 11.0 1.0 12.0 -1.0 0 11
107863 36 months 475.71 2 years OWN 80000.00 Source Verified 2018-01-01 car CA 1.35 0.0 Jul-2007 660.0 1.0 31.0 -1.0 11.0 0.0 1461.0 4.1 21.0 14662.947011 0.0 674.0 31.0 Individual -1.0 -1.00 0.0 1461.0 1.0 0.0 0.0 1.0 -1.0 1.0 2.0 35300.0 0.0 2.0 3.0 162.0 0.0 0.0 31.0 0.0 5.0 19.0 -1.0 0 1

107864 rows × 51 columns

  • Round float variables
In [54]:
for_segmentation['dti_rounded'] = np.round(for_segmentation['dti']).astype(int)
for_segmentation['instlmnt_round'] = np.round(for_segmentation['installment']).astype(int)
for_segmentation['Annual_Inc_round'] = np.round(for_segmentation['annual_inc']).astype(int)
for_segmentation['instlmnt_to_Annual_Inc'] = np.round(for_segmentation['instlmnt_round']/for_segmentation['Annual_Inc_round'])
for_segmentation['revol_bal_round'] = np.round(for_segmentation['revol_bal']).astype(int)
for_segmentation['revol_util_round'] = np.round(for_segmentation['revol_util']).astype(int)
for_segmentation['total_pymnt_round'] = np.round(for_segmentation['total_pymnt']).astype(int)

for_segmentation['recoveries_round'] = np.round(for_segmentation['recoveries']).astype(int)

for_segmentation['avg_cur_bal_round'] = np.round(for_segmentation['avg_cur_bal']).astype(int)
  • "years_with_Credit_line" variable creation
In [55]:
# Convert the date_column to datetime format
for_segmentation['earliest_cr_line'] = pd.to_datetime(for_segmentation['earliest_cr_line'], format='%b-%Y')

# Calculate the difference in years with the year 2018 (applications received during 2018)
for_segmentation['years_with_Credit_line'] = 2018 - for_segmentation['earliest_cr_line'].dt.year
for_segmentation['years_with_Credit_line'] 
Out[55]:
0          9
1         20
2         11
3         23
4         20
          ..
107859    36
107860    24
107861    21
107862    20
107863    11
Name: years_with_Credit_line, Length: 107864, dtype: int32
In [56]:
def analyze_column_and_export(for_segmentation, column_name, output_file):
    # Ensure the necessary columns are available
    if 'Defaulted' not in for_segmentation.columns:
        raise ValueError("The dataset must contain a 'Defaulted' column.")

    # Calculate "Percent of Total Frequency" for each unique value
    total_count = for_segmentation[column_name].value_counts(normalize=True) * 100
    
    # Calculate the "Frequency Count" of Goods (Defaulted = 0)
    goods_count = for_segmentation[for_segmentation['Defaulted'] == 0][column_name].value_counts()
    
    # Calculate "Percent of Column Frequency" for the Goods
    goods_percent = for_segmentation[for_segmentation['Defaulted'] == 0][column_name].value_counts(normalize=True) * 100
    
    # Calculate the "Frequency Count" of the Bads (Defaulted = 1)
    bads_count = for_segmentation[for_segmentation['Defaulted'] == 1][column_name].value_counts()
    
    # Calculate "Percent of Column Frequency" for the Bads
    bads_percent = for_segmentation[for_segmentation['Defaulted'] == 1][column_name].value_counts(normalize=True) * 100
    
    # Create a DataFrame with the calculated values
    summary_df = pd.DataFrame({
        column_name: total_count.index,
        'Percent of Total Frequency': total_count.values,
        'Frequency Count (Goods)': goods_count,
        'Percent of Column Frequency (Goods)': goods_percent,
        'Frequency Count (Bads)': bads_count,
        'Percent of Column Frequency (Bads)': bads_percent,
    }).fillna(0)
    
    # Calculate the good-bad odds
    summary_df['Good-Bad Odds'] = summary_df['Frequency Count (Goods)'] / summary_df['Frequency Count (Bads)']
    summary_df['Good-Bad Odds'].replace([np.inf, -np.inf], 0, inplace=True)
    
    # Calculate the bad rate and convert it to a percentage
    summary_df['Bad Rate'] = (summary_df['Frequency Count (Bads)'] / (summary_df['Frequency Count (Goods)'] + summary_df['Frequency Count (Bads)'])) * 100
    
    # Calculate the weight of evidence (WOE)
    summary_df['WOE'] = np.log(np.where(summary_df['Percent of Column Frequency (Bads)'] == 0, np.nan, summary_df['Percent of Column Frequency (Goods)'] / summary_df['Percent of Column Frequency (Bads)']))
    
    # Replace -inf, inf and NaN in WOE with 0
    summary_df['WOE'].replace([np.inf, -np.inf], 0, inplace=True)
    summary_df['WOE'].fillna(0, inplace=True)
    
    # Calculate Information Value (IV)
    summary_df['IV'] = (summary_df['Percent of Column Frequency (Goods)'] - summary_df['Percent of Column Frequency (Bads)']) * summary_df['WOE'] 
    
    # Fillna with 0 in IV
    summary_df['IV'].fillna(0, inplace=True)
    
    # Add the 'grpchar' column as a copy of the input column
    summary_df['grpchar'] = summary_df[column_name]
    
    # Reset index to avoid ambiguity
    summary_df.reset_index(drop=True, inplace=True)
    
    # Sort the DataFrame based on the specified column in ascending order
    summary_df.sort_values(by=column_name, inplace=True)
    
    # Save the summary DataFrame to an Excel file
    summary_df.to_excel(output_file, index=False)
  • To find optimal segmentations using excel - based on the IVs
In [57]:
# Analyze each column individualy

# Run for each below predictor independently -to obtain the "For_IV_Segm" excel file

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'mths_since_recent_inq', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'total_acc', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'il_util', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'num_rev_accts', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'mort_acc', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'application_type', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'open_rv_12m', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'acc_open_past_24mths', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'open_rv_24m', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'open_acc_6m', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'inq_last_12m', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'recoveries_round', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'open_il_12m', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'last_fico_range_high', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'mths_since_last_major_derog', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'fico_range_low', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'revol_util', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'term', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'emp_length', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'dti_rounded', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'instlmnt_to_Annual_Inc', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'home_ownership', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'verification_status', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'purpose', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'years_with_Credit_line', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'inq_last_6mths', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'mths_since_last_delinq', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'mths_since_last_record', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'open_acc', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'pub_rec', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'revol_bal', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'term', output_file)

output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
analyze_column_and_export(for_segmentation, 'dti_rounded', output_file)

Create the segmented characteristics¶

  • Data preparation
In [58]:
# Fill missing values accordingly

#####################################################################################
#####################################################################################
                                # TRAIN SET
#####################################################################################
#####################################################################################

X_train[categorical] = X_train[categorical].fillna('MISSING')
X_train[discrete] = X_train[discrete].fillna(-1)
X_train[continuous] = X_train[continuous].fillna(-1)


X_train['dti_rounded'] = np.round(X_train['dti']).astype(int)
X_train['instlmnt_round'] = np.round(X_train['installment']).astype(int)
X_train['Annual_Inc_round'] = np.round(X_train['annual_inc']).astype(int)
X_train['instlmnt_to_Annual_Inc'] = np.round(X_train['instlmnt_round']/X_train['Annual_Inc_round'])
X_train['revol_bal_round'] = np.round(X_train['revol_bal']).astype(int)
X_train['revol_util_round'] = np.round(X_train['revol_util']).astype(int)
# X_train['out_prncp_round'] = np.round(X_train['out_prncp']).astype(int)
X_train['total_pymnt_round'] = np.round(X_train['total_pymnt']).astype(int)

X_train['recoveries_round'] = np.round(X_train['recoveries']).astype(int)

X_train['avg_cur_bal_round'] = np.round(X_train['avg_cur_bal']).astype(int)


# Convert the date_column to datetime format
X_train['earliest_cr_line'] = pd.to_datetime(X_train['earliest_cr_line'], format='%b-%Y')

# Calculate the difference in years with the year 2018 (applications received during 2018)
X_train['years_with_Credit_line'] = 2018 - X_train['earliest_cr_line'].dt.year
X_train['years_with_Credit_line'] 




#####################################################################################
#####################################################################################
                                # TEST SET
#####################################################################################
#####################################################################################
X_test[categorical] = X_test[categorical].fillna('MISSING')
X_test[discrete] = X_test[discrete].fillna(-1)
X_test[continuous] = X_test[continuous].fillna(-1)

X_test['dti_rounded'] = np.round(X_test['dti'],0).astype(int)

X_test['dti_rounded'] = np.round(X_test['dti']).astype(int)
X_test['instlmnt_round'] = np.round(X_test['installment']).astype(int)
X_test['Annual_Inc_round'] = np.round(X_test['annual_inc']).astype(int)
X_test['instlmnt_to_Annual_Inc'] = np.round(X_test['instlmnt_round']/X_test['Annual_Inc_round'])
X_test['revol_bal_round'] = np.round(X_test['revol_bal']).astype(int)
X_test['revol_util_round'] = np.round(X_test['revol_util']).astype(int)
# X_test['out_prncp_round'] = np.round(X_test['out_prncp']).astype(int)
X_test['total_pymnt_round'] = np.round(X_test['total_pymnt']).astype(int)

X_test['recoveries_round'] = np.round(X_test['recoveries']).astype(int)

X_test['avg_cur_bal_round'] = np.round(X_test['avg_cur_bal']).astype(int)


# Convert the date_column to datetime format
X_test['earliest_cr_line'] = pd.to_datetime(X_test['earliest_cr_line'], format='%b-%Y')

# Calculate the difference in years with the year 2018 (applications received during 2018)
X_test['years_with_Credit_line'] = 2018 - X_test['earliest_cr_line'].dt.year
  • The new columns will be names as "_segm" in the end of each name correspondingly
In [59]:
# Function to segment variables using if-else
def segment_variables(df):
    # mths_since_recent_inq
    df['mths_since_recent_inq_segm'] = df['mths_since_recent_inq'].apply(
        lambda x: -1 if x == -1 else
                   1 if 0 <= x <= 2 else
                   2 if 3 <= x <= 10 else
                   3
    ).astype(int)

    # delinq_2yrs
    df['delinq_2yrs_segm'] = df['delinq_2yrs'].apply(
        lambda x: 1 if x >= 3 else
                  2 if x == 2 else
                  3 if x == 1 else
                  4
    ).astype(int)

    # fico_range_low
    df['fico_range_low_segm'] = df['fico_range_low'].apply(
        lambda x: 1 if x <= 680 else
                  2 if 685 <= x <= 700 else
                  3 if 705 <= x <= 720 else
                  4 if 725 <= x <= 790 else
                  5                         )

 # term
#     term_map = {
#         ' 60 months': 1,
#         ' 36 months': 2
#     }
#     df['term_segm'] = df['term'].map(term_map).fillna(-1).astype(int)


    df['term_segm'] = df['term'].apply(
        lambda x: 1 if x ==' 60 months' else
                  2    ).fillna(-1).astype(int)


    # emp_length
    emp_length_map = {
        'MISSING': -1,
        '10+ years': 1,
        '9 years': 1,
        '1 year':  2,
        '2 years': 2,
        '3 years': 2,
        '4 years': 2,
        '5 years': 2,
        '6 years': 2,
        '7 years': 2,
        '8 years': 2,
        '< 1 year': 2
    }
    df['emp_length_segm'] = df['emp_length'].map(emp_length_map).fillna(-1).astype(int)

    # dti_rounded
    df['dti_rounded_segm'] = df['dti_rounded'].apply(
        lambda x: -1 if x == -1 else
                   1 if 0 <= x <= 7 else
                   2 
    ).astype(int)

    # Inc_Rounded
    df['Annual_Inc_round_segm'] = df['Annual_Inc_round'].apply(
        lambda x: 1 if x <= 84999 else
                  2 if 85000 <= x <= 109999 else
                  3 if 110000 <= x <= 214999 else
                  4
    ).astype(int)

    # Instalment_rounded
    df['instlmnt_round_segm'] = df['instlmnt_round'].apply(
        lambda x: 1 if x <= 479 else
                  2 if 480 <= x <= 699 else
                  3 if 700 <= x <= 879 else
                  4
    ).astype(int)

    # home_ownership
    home_ownership_map = {
        'ANY': 1, 
        'OWN': 1,
        'MORTGAGE': 2,
        'RENT': 2
    }
    df['home_ownership_segm'] = df['home_ownership'].map(home_ownership_map).fillna(-1).astype(int)

    # status_verified
    status_verified_map = {
        'Not Verified': 3,
        'Source Verified': 2,
        'Verified': 1
    }
    df['verification_status_segm'] = df['verification_status'].map(status_verified_map).fillna(-1).astype(int)

    # purpose
    purpose_map = {
        'car': 1,
        'house': 1,
        'major_purchase': 1,
        'medical': 1,
        'moving': 1,
        'small_business': 1,        
        'debt_consolidation': 2,
        'home_improvement': 2,
        'other': 2, 
        'renewable_energy': 2,
        'vacation': 2,
        'wedding': 2,
        'credit_card': 3
    }
    df['purpose_segm'] = df['purpose'].map(purpose_map).fillna(-1).astype(int)

    # years_with_Credit_line
    df['years_with_Credit_line_segm'] = df['years_with_Credit_line'].apply(
        lambda x: 1 if 10 <= x <= 18 else
                  1 if x >= 40 else
                  2
    ).astype(int)

    # inq_last_6mths
    df['inq_last_6mths_segm'] = df['inq_last_6mths'].apply(
        lambda x: 1 if x >= 3 else
                  2 if x == 2 else
                  3 if x == 1 else
                  4
    ).astype(int)

    # mths_since_last_delinq
    df['mths_since_last_delinq_segm'] = df['mths_since_last_delinq'].apply(
        lambda x: -1 if x == -1 else
                  1 if 0 <= x <= 45 else
                  2 
    ).astype(int)

    # mths_since_last_record
    df['mths_since_last_record_segm'] = df['mths_since_last_record'].apply(
        lambda x: -1 if x == -1 else
                   1 if 1 <= x <= 68 else
                   2
    ).astype(int)

    # revol_bal_grouped
    df['revol_bal_segm'] = df['revol_bal'].apply(
        lambda x: 1 if x <= 9999 else
                  2 if 10000 <= x <= 19999 else
                  3 if 20000 <= x <= 39999 else
                  4
    ).astype(int)

    # out_prncp_round
#     df['out_prncp_round_segm'] = df['out_prncp_round'].apply(
#         lambda x: 1 if x > 0 else
#                   2
#     ).astype(int)

    # total_pymnt_round
    df['total_pymnt_round_segm'] = df['total_pymnt_round'].apply(
        lambda x: 1 if x <= 4999 else
                  2 if 5000 <= x <= 9999 else
                  3 if 10000 <= x <= 14999 else
                  4
    ).astype(int)

    # mths_since_last_major_derog
    df['mths_since_last_major_derog_segm'] = df['mths_since_last_major_derog'].apply(
        lambda x: -1 if x == -1 else
                  1 if 0 <= x <= 45 else
                  2 if 46 <= x <= 78 else
                  3
    ).astype(int)

    # application_type
    application_type_map = {
        'Joint_App': 1,
        'Individual': 2
    }
    df['application_type_segm'] = df['application_type'].map(application_type_map).fillna(-1).astype(int)
    

    # open_acc_6m
    df['open_acc_6m_segm'] = df['open_acc_6m'].apply(
        lambda x:  1 if x <= 1 else
                   2 if x > 1 else
                  -1
    ).astype(int)

    # open_il_12m
    df['open_il_12m_segm'] = df['open_il_12m'].apply(
        lambda x: 1 if x <= 1 else
                  2 if x > 1 else
                 -1
    ).astype(int)

    # open_rv_12m
    df['open_rv_12m_segm'] = df['open_rv_12m'].apply(
        lambda x: 1 if x <= 1 else
                  2 if x > 1 else
                  -1
    ).astype(int)

    # open_rv_24m
    df['open_rv_24m_segm'] = df['open_rv_24m'].apply(
        lambda x: 1 if x <= 1 else
                  2 if 2 <= x <= 6 else
                  3 if x >= 7 else
                 -1
    ).astype(int)

    # inq_last_12m
    df['inq_last_12m_segm'] = df['inq_last_12m'].apply(
        lambda x: 1 if x <= 1 else
                  2 if 2 <= x <= 6 else
                  3 if x >= 7 else
                  -1
    ).astype(int)

    # acc_open_past_24mths
    df['acc_open_past_24mths_segm'] = df['acc_open_past_24mths'].apply(
        lambda x: 1 if x <= 1 else
                  2 if 2 <= x <= 6 else
                  3 if x >= 7 else
                  -1
    ).astype(int).astype(int)

    # mort_acc
    df['mort_acc_segm'] = df['mort_acc'].apply(
        lambda x: 1 if x == 0 else
                  2 if x == 1 else
                  3 if x == 2 else
                  4
    ).astype(int)

    return df

# Apply the segmentations - TRAINING set
X_train_segmented = segment_variables(X_train)

# Apply the segmentations - TEST set
X_test_segmented = segment_variables(X_test)

X_train_segmented
Out[59]:
term installment emp_length home_ownership annual_inc verification_status issue_d purpose addr_state dti delinq_2yrs earliest_cr_line fico_range_low inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc total_pymnt recoveries last_fico_range_high mths_since_last_major_derog application_type annual_inc_joint dti_joint tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m il_util open_rv_12m open_rv_24m total_rev_hi_lim total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal delinq_amnt mort_acc mths_since_recent_bc_dlq mths_since_recent_inq num_accts_ever_120_pd num_rev_accts revol_bal_joint dti_rounded instlmnt_round Annual_Inc_round instlmnt_to_Annual_Inc revol_bal_round revol_util_round total_pymnt_round recoveries_round avg_cur_bal_round years_with_Credit_line mths_since_recent_inq_segm delinq_2yrs_segm fico_range_low_segm term_segm emp_length_segm dti_rounded_segm Annual_Inc_round_segm instlmnt_round_segm home_ownership_segm verification_status_segm purpose_segm years_with_Credit_line_segm inq_last_6mths_segm mths_since_last_delinq_segm mths_since_last_record_segm revol_bal_segm total_pymnt_round_segm mths_since_last_major_derog_segm application_type_segm open_acc_6m_segm open_il_12m_segm open_rv_12m_segm open_rv_24m_segm inq_last_12m_segm acc_open_past_24mths_segm mort_acc_segm
0 36 months 186.82 8 years RENT 50000.00 Verified 2018-03-01 other OK 21.80 1.0 2009-01-01 665.0 0.0 9.0 -1.0 5.0 0.0 116.0 23.2 18.0 2043.690000 0.0 609.0 9.0 Individual -1.0 -1.00 0.0 19344.0 0.0 2.0 0.0 1.0 51.0 1.0 2.0 500.0 0.0 5.0 3.0 3869.0 0.0 0.0 -1.0 2.0 4.0 2.0 -1.0 22 187 50000 0.0 116 23 2044 0 3869 9 1 3 1 2 2 2 1 1 2 1 2 2 4 1 -1 1 1 1 2 1 1 1 2 2 2 1
1 36 months 483.45 2 years OWN 196000.00 Source Verified 2018-03-01 debt_consolidation FL 18.29 0.0 1998-07-01 700.0 0.0 65.0 -1.0 19.0 0.0 24243.0 46.3 53.0 5301.420000 0.0 694.0 -1.0 Individual -1.0 -1.00 0.0 534954.0 4.0 3.0 2.0 2.0 59.0 4.0 12.0 52400.0 1.0 7.0 15.0 31468.0 0.0 5.0 -1.0 6.0 0.0 37.0 -1.0 18 483 196000 0.0 24243 46 5301 0 31468 20 2 4 2 2 2 2 3 2 1 2 2 2 4 2 -1 3 2 -1 2 2 2 2 3 3 3 4
2 60 months 367.82 < 1 year RENT 44000.00 Not Verified 2018-03-01 medical NH 43.97 1.0 2007-07-01 665.0 2.0 6.0 -1.0 8.0 0.0 1526.0 24.6 14.0 4007.700000 0.0 629.0 70.0 Joint App 81000.0 31.94 0.0 67173.0 1.0 4.0 1.0 4.0 89.0 1.0 1.0 6200.0 1.0 10.0 5.0 8397.0 0.0 0.0 35.0 0.0 1.0 6.0 7101.0 44 368 44000 0.0 1526 25 4008 0 8397 11 1 3 1 1 2 2 1 1 2 3 1 1 2 1 -1 1 1 2 -1 1 1 1 1 3 2 1
3 60 months 688.35 10+ years MORTGAGE 65000.00 Source Verified 2018-03-01 debt_consolidation AL 12.89 1.0 1995-03-01 665.0 1.0 22.0 -1.0 7.0 0.0 8657.0 98.4 16.0 7511.160000 0.0 669.0 23.0 Individual -1.0 -1.00 0.0 74795.0 0.0 2.0 0.0 2.0 82.0 0.0 0.0 8800.0 3.0 3.0 2.0 10685.0 0.0 2.0 -1.0 0.0 2.0 9.0 -1.0 13 688 65000 0.0 8657 98 7511 0 10685 23 1 3 1 1 1 2 1 2 2 2 2 2 3 1 -1 1 2 1 2 1 1 1 1 2 2 3
4 36 months 93.10 9 years RENT 52000.00 Source Verified 2018-03-01 major_purchase WA 0.58 0.0 1998-01-01 760.0 0.0 26.0 -1.0 7.0 0.0 141.0 0.5 30.0 3011.577285 0.0 764.0 -1.0 Individual -1.0 -1.00 0.0 150592.0 0.0 0.0 1.0 2.0 -1.0 0.0 1.0 31000.0 2.0 2.0 3.0 25099.0 0.0 4.0 -1.0 7.0 0.0 19.0 -1.0 1 93 52000 0.0 141 0 3012 0 25099 20 2 4 4 2 1 1 1 1 2 2 1 2 4 1 -1 1 1 -1 2 1 1 1 1 2 2 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107859 60 months 270.71 MISSING MORTGAGE 89625.39 Not Verified 2018-01-01 debt_consolidation CA 17.61 0.0 1982-01-01 660.0 0.0 40.0 -1.0 10.0 0.0 18601.0 90.3 37.0 12483.154233 0.0 689.0 40.0 Individual -1.0 -1.00 0.0 473894.0 1.0 2.0 2.0 2.0 88.0 1.0 1.0 20600.0 0.0 0.0 3.0 47389.0 0.0 1.0 40.0 -1.0 16.0 27.0 -1.0 18 271 89625 0.0 18601 90 12483 0 47389 36 -1 4 1 1 -1 2 2 1 2 3 2 2 4 1 -1 2 3 1 2 1 2 1 1 1 2 2
107860 36 months 149.70 10+ years MORTGAGE 52000.00 Not Verified 2018-01-01 home_improvement IL 33.72 0.0 1994-02-01 690.0 0.0 -1.0 -1.0 22.0 0.0 28116.0 49.2 41.0 2092.380000 0.0 684.0 -1.0 Individual -1.0 -1.00 249.0 217780.0 1.0 2.0 0.0 2.0 66.0 1.0 3.0 57200.0 0.0 0.0 5.0 9899.0 0.0 3.0 -1.0 17.0 0.0 34.0 -1.0 34 150 52000 0.0 28116 49 2092 0 9899 24 3 4 2 2 1 2 1 1 2 3 2 2 4 -1 -1 3 1 -1 2 1 1 1 2 1 2 4
107861 36 months 196.18 10+ years MORTGAGE 50000.00 Source Verified 2018-01-01 debt_consolidation NH 28.93 0.0 1997-06-01 690.0 0.0 58.0 -1.0 11.0 0.0 6950.0 51.9 14.0 2742.880000 0.0 819.0 58.0 Individual -1.0 -1.00 0.0 230614.0 0.0 1.0 1.0 1.0 83.0 1.0 1.0 13400.0 0.0 2.0 2.0 20965.0 0.0 2.0 58.0 7.0 1.0 11.0 -1.0 29 196 50000 0.0 6950 52 2743 0 20965 21 2 4 2 2 1 2 1 1 2 2 2 2 4 2 -1 1 1 2 2 1 1 1 1 2 2 3
107862 36 months 389.58 8 years MORTGAGE 36000.00 Verified 2018-01-01 debt_consolidation IN 11.10 1.0 1998-05-01 685.0 0.0 21.0 -1.0 14.0 0.0 11648.0 43.6 18.0 5593.050000 0.0 694.0 21.0 Individual -1.0 -1.00 0.0 191131.0 2.0 1.0 0.0 0.0 -1.0 3.0 6.0 26700.0 0.0 1.0 6.0 14702.0 0.0 1.0 -1.0 11.0 1.0 12.0 -1.0 11 390 36000 0.0 11648 44 5593 0 14702 20 3 3 2 2 2 2 1 1 2 1 2 2 4 1 -1 2 2 1 2 2 1 2 2 1 2 2
107863 36 months 475.71 2 years OWN 80000.00 Source Verified 2018-01-01 car CA 1.35 0.0 2007-07-01 660.0 1.0 31.0 -1.0 11.0 0.0 1461.0 4.1 21.0 14662.947011 0.0 674.0 31.0 Individual -1.0 -1.00 0.0 1461.0 1.0 0.0 0.0 1.0 -1.0 1.0 2.0 35300.0 0.0 2.0 3.0 162.0 0.0 0.0 31.0 0.0 5.0 19.0 -1.0 1 476 80000 0.0 1461 4 14663 0 162 11 1 4 1 2 2 1 1 1 1 2 1 1 3 1 -1 1 3 1 2 1 1 1 2 2 2 1

107864 rows × 85 columns

In [60]:
print(X_train_segmented.shape)
print(X_test_segmented.shape)
(107864, 85)
(42928, 85)
  • Check whether the segmentations are made as intended
In [61]:
# Crosstab with absolute volumes

crosstab_absolute1 = pd.crosstab(X_train_segmented['purpose_segm'], X_train_segmented['purpose'])
crosstab_absolute2 = pd.crosstab(X_train_segmented['term_segm'], X_train_segmented['term'])
crosstab_absolute3 = pd.crosstab(X_train_segmented['years_with_Credit_line_segm'], X_train_segmented['years_with_Credit_line'])
crosstab_absolute4 = pd.crosstab(X_train_segmented['home_ownership_segm'], X_train_segmented['home_ownership'])
crosstab_absolute5 = pd.crosstab(X_train_segmented['application_type_segm'], X_train_segmented['application_type'])
crosstab_absolute6 = pd.crosstab(X_train_segmented['delinq_2yrs_segm'], X_train_segmented['delinq_2yrs'])
crosstab_absolute7 = pd.crosstab(X_train_segmented['mths_since_last_delinq_segm'], X_train_segmented['mths_since_last_delinq'])
crosstab_absolute8 = pd.crosstab(X_train_segmented['mths_since_last_record_segm'], X_train_segmented['mths_since_last_record'])
crosstab_absolute9 = pd.crosstab(X_train_segmented['mths_since_last_major_derog_segm'], X_train_segmented['mths_since_last_major_derog'])
crosstab_absolute10 = pd.crosstab(X_train_segmented['open_rv_24m_segm'], X_train_segmented['open_rv_24m'])
crosstab_absolute11 = pd.crosstab(X_train_segmented['open_rv_12m_segm'], X_train_segmented['open_rv_12m'])
crosstab_absolute12 = pd.crosstab(X_train_segmented['open_acc_6m_segm'], X_train_segmented['open_acc_6m'])
crosstab_absolute13 = pd.crosstab(X_train_segmented['mort_acc_segm'], X_train_segmented['mort_acc'])
crosstab_absolute14 = pd.crosstab(X_train_segmented['emp_length_segm'], X_train_segmented['emp_length'])
crosstab_absolute15 = pd.crosstab(X_train_segmented['open_il_12m_segm'], X_train_segmented['open_il_12m'])
crosstab_absolute16 = pd.crosstab(X_train_segmented['mths_since_recent_inq_segm'], X_train_segmented['mths_since_recent_inq'])
crosstab_absolute17 = pd.crosstab(X_train_segmented['inq_last_6mths_segm'], X_train_segmented['inq_last_6mths'])
crosstab_absolute18 = pd.crosstab(X_train_segmented['inq_last_12m_segm'], X_train_segmented['inq_last_12m'])
crosstab_absolute19 = pd.crosstab(X_train_segmented['fico_range_low_segm'], X_train_segmented['fico_range_low'])
crosstab_absolute20 = pd.crosstab(X_train_segmented['acc_open_past_24mths_segm'], X_train_segmented['acc_open_past_24mths'])
crosstab_absolute21 = pd.crosstab(X_train_segmented['verification_status_segm'], X_train_segmented['verification_status'])

# Continuous variables
crosstab_absolute22 = pd.crosstab(X_train_segmented['revol_bal_segm'], X_train_segmented['revol_bal'])
crosstab_absolute23 = pd.crosstab(X_train_segmented['Annual_Inc_round_segm'], X_train_segmented['Annual_Inc_round'])
# crosstab_absolute24 = pd.crosstab(X_train_segmented['out_prncp_round_segm'], X_train_segmented['out_prncp_round'])
crosstab_absolute25 = pd.crosstab(X_train_segmented['total_pymnt_round_segm'], X_train_segmented['total_pymnt_round'])
crosstab_absolute26 = pd.crosstab(X_train_segmented['instlmnt_round_segm'], X_train_segmented['instlmnt_round'])
crosstab_absolute27 = pd.crosstab(X_train_segmented['dti_rounded_segm'], X_train_segmented['dti_rounded'])

# Display the results
# print("Crosstab with Absolute Volumes:\n", crosstab_absolute)
In [62]:
print("Crosstab with Absolute Volumes:\n", crosstab_absolute1)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute2)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute3)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute4)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute5)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute6)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute7)
Crosstab with Absolute Volumes:
 purpose        car  credit_card  debt_consolidation  home_improvement  house  \
purpose_segm                                                                   
1             1361            0                   0                 0   1579   
2                0            0               55083              7640      0   
3                0        24577                   0                 0      0   

purpose       major_purchase  medical  moving  other  renewable_energy  \
purpose_segm                                                             
1                       3217     1765     745      0                 0   
2                          0        0       0   9734                63   
3                          0        0       0      0                 0   

purpose       small_business  vacation  wedding  
purpose_segm                                     
1                       1346         0        0  
2                          0       749        5  
3                          0         0        0  
Crosstab with Absolute Volumes:
 term       36 months  60 months
term_segm                      
1                  0      32452
2              75412          0
Crosstab with Absolute Volumes:
 years_with_Credit_line        3     4     5     6     7     8     9     10  \
years_with_Credit_line_segm                                                  
1                              0     0     0     0     0     0     0  3909   
2                            143  2262  2403  2530  2869  2644  2551     0   

years_with_Credit_line         11    12    13    14    15    16    17    18  \
years_with_Credit_line_segm                                                   
1                            6547  7936  7936  7453  6729  5653  5297  4902   
2                               0     0     0     0     0     0     0     0   

years_with_Credit_line         19    20    21    22    23    24    25    26  \
years_with_Credit_line_segm                                                   
1                               0     0     0     0     0     0     0     0   
2                            4267  3673  3209  3088  2914  2538  2115  1499   

years_with_Credit_line         27    28    29    30    31   32   33   34   35  \
years_with_Credit_line_segm                                                     
1                               0     0     0     0     0    0    0    0    0   
2                            1411  1465  1413  1146  1132  907  839  780  631   

years_with_Credit_line        36   37   38   39   40   41   42   43   44   45  \
years_with_Credit_line_segm                                                     
1                              0    0    0    0  247  261  189  143  148  126   
2                            493  354  266  335    0    0    0    0    0    0   

years_with_Credit_line       46  47  48  49  50  51  52  53  54  55  56  57  \
years_with_Credit_line_segm                                                   
1                            93  72  63  64  54  48  26  28  25  13   6   5   
2                             0   0   0   0   0   0   0   0   0   0   0   0   

years_with_Credit_line       58  59  60  62  65  67  68  
years_with_Credit_line_segm                              
1                             4   4   1   1   1   2   1  
2                             0   0   0   0   0   0   0  
Crosstab with Absolute Volumes:
 home_ownership       ANY  MORTGAGE    OWN   RENT
home_ownership_segm                             
1                      4         0  14010      0
2                      0     51874      0  41976
Crosstab with Absolute Volumes:
 application_type       Individual  Joint App
application_type_segm                       
-1                              0      16331
 2                          91533          0
Crosstab with Absolute Volumes:
 delinq_2yrs        0.0    1.0   2.0   3.0   4.0   5.0   6.0   7.0   8.0   \
delinq_2yrs_segm                                                           
1                     0      0     0   902   419   193   108    54    41   
2                     0      0  2727     0     0     0     0     0     0   
3                     0  10916     0     0     0     0     0     0     0   
4                 92407      0     0     0     0     0     0     0     0   

delinq_2yrs       9.0   10.0  11.0  12.0  13.0  14.0  15.0  16.0  17.0  19.0  \
delinq_2yrs_segm                                                               
1                   26    32    15     9     3     3     2     1     2     2   
2                    0     0     0     0     0     0     0     0     0     0   
3                    0     0     0     0     0     0     0     0     0     0   
4                    0     0     0     0     0     0     0     0     0     0   

delinq_2yrs       20.0  
delinq_2yrs_segm        
1                    2  
2                    0  
3                    0  
4                    0  
Crosstab with Absolute Volumes:
 mths_since_last_delinq       -1.0     0.0     1.0     2.0     3.0     4.0    \
mths_since_last_delinq_segm                                                   
-1                            60695       0       0       0       0       0   
 1                                0       9      91     237     449     553   
 2                                0       0       0       0       0       0   

mths_since_last_delinq        5.0     6.0     7.0     8.0     9.0     10.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                              564     718     700     628     639     629   
 2                                0       0       0       0       0       0   

mths_since_last_delinq        11.0    12.0    13.0    14.0    15.0    16.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                              722     910     912     810     751     755   
 2                                0       0       0       0       0       0   

mths_since_last_delinq        17.0    18.0    19.0    20.0    21.0    22.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                              686     767     790     667     686     721   
 2                                0       0       0       0       0       0   

mths_since_last_delinq        23.0    24.0    25.0    26.0    27.0    28.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                              793     884     912     857     765     771   
 2                                0       0       0       0       0       0   

mths_since_last_delinq        29.0    30.0    31.0    32.0    33.0    34.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                              790     745     645     688     677     714   
 2                                0       0       0       0       0       0   

mths_since_last_delinq        35.0    36.0    37.0    38.0    39.0    40.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                              669     748     802     755     691     616   
 2                                0       0       0       0       0       0   

mths_since_last_delinq        41.0    42.0    43.0    44.0    45.0    46.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                              667     631     630     607     612       0   
 2                                0       0       0       0       0     613   

mths_since_last_delinq        47.0    48.0    49.0    50.0    51.0    52.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              679     683     579     495     431     478   

mths_since_last_delinq        53.0    54.0    55.0    56.0    57.0    58.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              433     419     429     410     419     449   

mths_since_last_delinq        59.0    60.0    61.0    62.0    63.0    64.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              473     490     495     481     458     417   

mths_since_last_delinq        65.0    66.0    67.0    68.0    69.0    70.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              412     432     454     397     390     394   

mths_since_last_delinq        71.0    72.0    73.0    74.0    75.0    76.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              403     411     433     407     412     364   

mths_since_last_delinq        77.0    78.0    79.0    80.0    81.0    82.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              320     333     328     276     228     126   

mths_since_last_delinq        83.0    84.0    85.0    86.0    87.0    88.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                               48      23      12       5       6      12   

mths_since_last_delinq        89.0    90.0    91.0    92.0    93.0    94.0   \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                                7       5       7       5       4       6   

mths_since_last_delinq        95.0    96.0    97.0    98.0    99.0    100.0  \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                                5       4       8       4       7       4   

mths_since_last_delinq        101.0   102.0   103.0   104.0   105.0   106.0  \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                               11       3       8       4       3       2   

mths_since_last_delinq        107.0   108.0   109.0   110.0   111.0   112.0  \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                                3       5       3       5       4       7   

mths_since_last_delinq        113.0   114.0   115.0   116.0   118.0   119.0  \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                                2       1       1       2       3       1   

mths_since_last_delinq        120.0   122.0   125.0   126.0   130.0   131.0  \
mths_since_last_delinq_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                                2       1       1       2       2       1   

mths_since_last_delinq        133.0   138.0   156.0   158.0   160.0   226.0  
mths_since_last_delinq_segm                                                  
-1                                0       0       0       0       0       0  
 1                                0       0       0       0       0       0  
 2                                1       1       1       1       1       1  
In [63]:
print("Crosstab with Absolute Volumes:\n", crosstab_absolute8)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute9)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute10)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute11)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute12)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute13)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute14)

print("Crosstab with Absolute Volumes:\n", crosstab_absolute15)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute16)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute17)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute18)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute19)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute20)
print("Crosstab with Absolute Volumes:\n", crosstab_absolute21)
Crosstab with Absolute Volumes:
 mths_since_last_record       -1.0     1.0     2.0     3.0     4.0     5.0    \
mths_since_last_record_segm                                                   
-1                            92595       0       0       0       0       0   
 1                                0       3       2      13      12      15   
 2                                0       0       0       0       0       0   

mths_since_last_record        6.0     7.0     8.0     9.0     10.0    11.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                               16      17      27      25      24      21   
 2                                0       0       0       0       0       0   

mths_since_last_record        12.0    13.0    14.0    15.0    16.0    17.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                               24      24      22      28      22      32   
 2                                0       0       0       0       0       0   

mths_since_last_record        18.0    19.0    20.0    21.0    22.0    23.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                               26      30      24      25      30      35   
 2                                0       0       0       0       0       0   

mths_since_last_record        24.0    25.0    26.0    27.0    28.0    29.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                               24      28      22      30      38      49   
 2                                0       0       0       0       0       0   

mths_since_last_record        30.0    31.0    32.0    33.0    34.0    35.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                               49      36      63      47      55      54   
 2                                0       0       0       0       0       0   

mths_since_last_record        36.0    37.0    38.0    39.0    40.0    41.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                               40      42      61      64      62      60   
 2                                0       0       0       0       0       0   

mths_since_last_record        42.0    43.0    44.0    45.0    46.0    47.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                               81      70      78      93      85     106   
 2                                0       0       0       0       0       0   

mths_since_last_record        48.0    49.0    50.0    51.0    52.0    53.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                              105      87      94      74      91     102   
 2                                0       0       0       0       0       0   

mths_since_last_record        54.0    55.0    56.0    57.0    58.0    59.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                              128     125     122     143     161     145   
 2                                0       0       0       0       0       0   

mths_since_last_record        60.0    61.0    62.0    63.0    64.0    65.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                              121     129     122     161     154     159   
 2                                0       0       0       0       0       0   

mths_since_last_record        66.0    67.0    68.0    69.0    70.0    71.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                              152     165     143       0       0       0   
 2                                0       0       0     182     200     191   

mths_since_last_record        72.0    73.0    74.0    75.0    76.0    77.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              181     169     208     200     207     218   

mths_since_last_record        78.0    79.0    80.0    81.0    82.0    83.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              224     246     233     233     296     248   

mths_since_last_record        84.0    85.0    86.0    87.0    88.0    89.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              270     255     230     252     260     269   

mths_since_last_record        90.0    91.0    92.0    93.0    94.0    95.0   \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              245     263     270     267     299     240   

mths_since_last_record        96.0    97.0    98.0    99.0    100.0   101.0  \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              224     218     187     255     224     229   

mths_since_last_record        102.0   103.0   104.0   105.0   106.0   107.0  \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              238     209     258     248     218     224   

mths_since_last_record        108.0   109.0   110.0   111.0   112.0   113.0  \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              163     166     169     163     159     140   

mths_since_last_record        114.0   115.0   116.0   117.0   118.0   119.0  \
mths_since_last_record_segm                                                   
-1                                0       0       0       0       0       0   
 1                                0       0       0       0       0       0   
 2                              151     134     115     128     116      30   

mths_since_last_record        120.0   121.0   122.0   123.0   124.0  
mths_since_last_record_segm                                          
-1                                0       0       0       0       0  
 1                                0       0       0       0       0  
 2                               10       4       1      13       2  
Crosstab with Absolute Volumes:
 mths_since_last_major_derog       -1.0     0.0     1.0     2.0     3.0    \
mths_since_last_major_derog_segm                                           
-1                                 83103       0       0       0       0   
 1                                     0       9      11      18      69   
 2                                     0       0       0       0       0   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        4.0     5.0     6.0     7.0     8.0    \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                    97     109     137     148     154   
 2                                     0       0       0       0       0   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        9.0     10.0    11.0    12.0    13.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                   174     184     190     239     250   
 2                                     0       0       0       0       0   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        14.0    15.0    16.0    17.0    18.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                   255     218     221     219     271   
 2                                     0       0       0       0       0   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        19.0    20.0    21.0    22.0    23.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                   247     233     247     268     270   
 2                                     0       0       0       0       0   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        24.0    25.0    26.0    27.0    28.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                   336     373     306     284     341   
 2                                     0       0       0       0       0   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        29.0    30.0    31.0    32.0    33.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                   300     355     307     348     334   
 2                                     0       0       0       0       0   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        34.0    35.0    36.0    37.0    38.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                   396     366     410     420     405   
 2                                     0       0       0       0       0   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        39.0    40.0    41.0    42.0    43.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                   353     349     364     331     364   
 2                                     0       0       0       0       0   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        44.0    45.0    46.0    47.0    48.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                   322     375       0       0       0   
 2                                     0       0     371     406     434   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        49.0    50.0    51.0    52.0    53.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                   398     360     309     350     353   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        54.0    55.0    56.0    57.0    58.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                   362     350     328     362     415   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        59.0    60.0    61.0    62.0    63.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                   402     394     379     356     384   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        64.0    65.0    66.0    67.0    68.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                   361     344     349     396     364   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        69.0    70.0    71.0    72.0    73.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                   329     319     351     322     359   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        74.0    75.0    76.0    77.0    78.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                   339     332     275     245     235   
 3                                     0       0       0       0       0   

mths_since_last_major_derog        79.0    80.0    81.0    82.0    83.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                     0       0       0       0       0   
 3                                   247     194     165     100      44   

mths_since_last_major_derog        84.0    85.0    86.0    87.0    88.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                     0       0       0       0       0   
 3                                    22      16      12      15      23   

mths_since_last_major_derog        89.0    90.0    91.0    92.0    93.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                     0       0       0       0       0   
 3                                    12      11      16      11      11   

mths_since_last_major_derog        94.0    95.0    96.0    97.0    98.0   \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                     0       0       0       0       0   
 3                                     8      10       6      16      12   

mths_since_last_major_derog        99.0    100.0   101.0   102.0   103.0  \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                     0       0       0       0       0   
 3                                    15       8      21       8      14   

mths_since_last_major_derog        104.0   105.0   106.0   107.0   108.0  \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                     0       0       0       0       0   
 3                                     9       7       9       9       8   

mths_since_last_major_derog        109.0   110.0   111.0   112.0   113.0  \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                     0       0       0       0       0   
 3                                     8       8       9      10       5   

mths_since_last_major_derog        114.0   115.0   116.0   117.0   118.0  \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                     0       0       0       0       0   
 3                                     5       6       2       3       5   

mths_since_last_major_derog        119.0   120.0   121.0   122.0   123.0  \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                     0       0       0       0       0   
 3                                     1       5       1       2       2   

mths_since_last_major_derog        125.0   126.0   128.0   130.0   131.0  \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                     0       0       0       0       0   
 3                                     2       2       2       2       1   

mths_since_last_major_derog        132.0   133.0   138.0   139.0   145.0  \
mths_since_last_major_derog_segm                                           
-1                                     0       0       0       0       0   
 1                                     0       0       0       0       0   
 2                                     0       0       0       0       0   
 3                                     2       1       1       1       1   

mths_since_last_major_derog        153.0   156.0   158.0   160.0   226.0  
mths_since_last_major_derog_segm                                          
-1                                     0       0       0       0       0  
 1                                     0       0       0       0       0  
 2                                     0       0       0       0       0  
 3                                     1       1       1       1       1  
Crosstab with Absolute Volumes:
 open_rv_24m        0.0    1.0    2.0    3.0    4.0   5.0   6.0   7.0   8.0   \
open_rv_24m_segm                                                              
1                 18931  24411      0      0      0     0     0     0     0   
2                     0      0  20872  14872  10004  6704  4209     0     0   
3                     0      0      0      0      0     0     0  2732  1719   

open_rv_24m       9.0   10.0  11.0  12.0  13.0  14.0  15.0  16.0  17.0  18.0  \
open_rv_24m_segm                                                               
1                    0     0     0     0     0     0     0     0     0     0   
2                    0     0     0     0     0     0     0     0     0     0   
3                 1124   721   512   318   213   166   101    67    55    43   

open_rv_24m       19.0  20.0  21.0  22.0  23.0  24.0  25.0  26.0  27.0  29.0  \
open_rv_24m_segm                                                               
1                    0     0     0     0     0     0     0     0     0     0   
2                    0     0     0     0     0     0     0     0     0     0   
3                   21    17    13    12     6     3     6     4     1     4   

open_rv_24m       34.0  35.0  38.0  
open_rv_24m_segm                    
1                    0     0     0  
2                    0     0     0  
3                    1     1     1  
Crosstab with Absolute Volumes:
 open_rv_12m        0.0    1.0    2.0   3.0   4.0   5.0   6.0   7.0   8.0   \
open_rv_12m_segm                                                            
1                 42323  31776      0     0     0     0     0     0     0   
2                     0      0  17357  8443  3980  2042   971   446   243   

open_rv_12m       9.0   10.0  11.0  12.0  13.0  14.0  15.0  16.0  17.0  18.0  \
open_rv_12m_segm                                                               
1                    0     0     0     0     0     0     0     0     0     0   
2                  133    59    38    20    16     6     4     3     1     1   

open_rv_12m       19.0  
open_rv_12m_segm        
1                    0  
2                    2  
Crosstab with Absolute Volumes:
 open_acc_6m        0.0    1.0    2.0   3.0   4.0   5.0   6.0   7.0   8.0   \
open_acc_6m_segm                                                            
1                 49996  33005      0     0     0     0     0     0     0   
2                     0      0  15358  6116  2180   757   254   113    57   

open_acc_6m       9.0   10.0  12.0  
open_acc_6m_segm                    
1                    0     0     0  
2                   21     4     3  
Crosstab with Absolute Volumes:
 mort_acc        0.0    1.0    2.0    3.0   4.0   5.0   6.0   7.0   8.0   9.0   \
mort_acc_segm                                                                   
1              48091      0      0      0     0     0     0     0     0     0   
2                  0  19449      0      0     0     0     0     0     0     0   
3                  0      0  16283      0     0     0     0     0     0     0   
4                  0      0      0  10988  6473  3385  1647   797   371   171   

mort_acc       10.0  11.0  12.0  13.0  14.0  15.0  16.0  17.0  18.0  22.0  \
mort_acc_segm                                                               
1                 0     0     0     0     0     0     0     0     0     0   
2                 0     0     0     0     0     0     0     0     0     0   
3                 0     0     0     0     0     0     0     0     0     0   
4                98    44    22    14     7     7     4     2     2     2   

mort_acc       23.0  24.0  25.0  31.0  46.0  
mort_acc_segm                                
1                 0     0     0     0     0  
2                 0     0     0     0     0  
3                 0     0     0     0     0  
4                 2     2     1     1     1  
Crosstab with Absolute Volumes:
 emp_length       1 year  10+ years  2 years  3 years  4 years  5 years  \
emp_length_segm                                                          
-1                    0          0        0        0        0        0   
 1                    0      35706        0        0        0        0   
 2                 7169          0    10191     9179     6918     6815   

emp_length       6 years  7 years  8 years  9 years  < 1 year  MISSING  
emp_length_segm                                                         
-1                     0        0        0        0         0     9428  
 1                     0        0        0     3123         0        0  
 2                  4716     4002     3278        0      7339        0  
Crosstab with Absolute Volumes:
 open_il_12m         0.0    1.0    2.0   3.0   4.0  5.0  6.0  8.0
open_il_12m_segm                                                
1                 60679  31424      0     0     0    0    0    0
2                     0      0  10938  3261  1092  350  119    1
Crosstab with Absolute Volumes:
 mths_since_recent_inq       -1.0    0.0    1.0    2.0    3.0    4.0    5.0   \
mths_since_recent_inq_segm                                                    
-1                          13648      0      0      0      0      0      0   
 1                              0   7360   8716   8012      0      0      0   
 2                              0      0      0      0   7155   6805   6183   
 3                              0      0      0      0      0      0      0   

mths_since_recent_inq        6.0    7.0    8.0    9.0    10.0   11.0   12.0  \
mths_since_recent_inq_segm                                                    
-1                              0      0      0      0      0      0      0   
 1                              0      0      0      0      0      0      0   
 2                           5784   5741   4902   4178   3788      0      0   
 3                              0      0      0      0      0   3168   2887   

mths_since_recent_inq        13.0   14.0   15.0   16.0   17.0   18.0   19.0  \
mths_since_recent_inq_segm                                                    
-1                              0      0      0      0      0      0      0   
 1                              0      0      0      0      0      0      0   
 2                              0      0      0      0      0      0      0   
 3                           2813   2636   2174   2025   1852   1771   1586   

mths_since_recent_inq        20.0   21.0   22.0   23.0   24.0  
mths_since_recent_inq_segm                                     
-1                              0      0      0      0      0  
 1                              0      0      0      0      0  
 2                              0      0      0      0      0  
 3                           1289   1130    985    859    417  
Crosstab with Absolute Volumes:
 inq_last_6mths         0.0    1.0   2.0   3.0  4.0  5.0
inq_last_6mths_segm                                    
1                        0      0     0  2492  132   36
2                        0      0  7996     0    0    0
3                        0  25578     0     0    0    0
4                    71630      0     0     0    0    0
Crosstab with Absolute Volumes:
 inq_last_12m        0.0    1.0    2.0    3.0   4.0   5.0   6.0   7.0   8.0   \
inq_last_12m_segm                                                             
1                  33209  26406      0      0     0     0     0     0     0   
2                      0      0  17422  11290  7094  4358  2800     0     0   
3                      0      0      0      0     0     0     0  1718  1165   

inq_last_12m       9.0   10.0  11.0  12.0  13.0  14.0  15.0  16.0  17.0  18.0  \
inq_last_12m_segm                                                               
1                     0     0     0     0     0     0     0     0     0     0   
2                     0     0     0     0     0     0     0     0     0     0   
3                   761   478   368   251   154   104    65    49    42    41   

inq_last_12m       19.0  20.0  21.0  22.0  23.0  24.0  25.0  26.0  27.0  29.0  \
inq_last_12m_segm                                                               
1                     0     0     0     0     0     0     0     0     0     0   
2                     0     0     0     0     0     0     0     0     0     0   
3                    25    14     9    11     2     4     3     6     3     3   

inq_last_12m       30.0  31.0  33.0  39.0  40.0  45.0  46.0  
inq_last_12m_segm                                            
1                     0     0     0     0     0     0     0  
2                     0     0     0     0     0     0     0  
3                     1     2     1     2     1     1     1  
Crosstab with Absolute Volumes:
 fico_range_low       660.0  665.0  670.0  675.0  680.0  685.0  690.0  695.0  \
fico_range_low_segm                                                           
1                     6326   6339   6452   5991   6613      0      0      0   
2                        0      0      0      0      0   6128   6083   5772   
3                        0      0      0      0      0      0      0      0   
4                        0      0      0      0      0      0      0      0   
5                        0      0      0      0      0      0      0      0   

fico_range_low       700.0  705.0  710.0  715.0  720.0  725.0  730.0  735.0  \
fico_range_low_segm                                                           
1                        0      0      0      0      0      0      0      0   
2                     5875      0      0      0      0      0      0      0   
3                        0   5542   5173   4700   4674      0      0      0   
4                        0      0      0      0      0   3849   3650   2965   
5                        0      0      0      0      0      0      0      0   

fico_range_low       740.0  745.0  750.0  755.0  760.0  765.0  770.0  775.0  \
fico_range_low_segm                                                           
1                        0      0      0      0      0      0      0      0   
2                        0      0      0      0      0      0      0      0   
3                        0      0      0      0      0      0      0      0   
4                     2929   2319   2176   1890   1674   1607   1339   1267   
5                        0      0      0      0      0      0      0      0   

fico_range_low       780.0  785.0  790.0  795.0  800.0  805.0  810.0  815.0  \
fico_range_low_segm                                                           
1                        0      0      0      0      0      0      0      0   
2                        0      0      0      0      0      0      0      0   
3                        0      0      0      0      0      0      0      0   
4                     1160    899    868      0      0      0      0      0   
5                        0      0      0    672    665    579    442    367   

fico_range_low       820.0  825.0  830.0  835.0  840.0  845.0  
fico_range_low_segm                                            
1                        0      0      0      0      0      0  
2                        0      0      0      0      0      0  
3                        0      0      0      0      0      0  
4                        0      0      0      0      0      0  
5                      291    262    141     88     54     43  
Crosstab with Absolute Volumes:
 acc_open_past_24mths       0.0    1.0    2.0    3.0    4.0    5.0   6.0   \
acc_open_past_24mths_segm                                                  
1                          5580  11992      0      0      0      0     0   
2                             0      0  16061  16273  14819  11841  9159   
3                             0      0      0      0      0      0     0   

acc_open_past_24mths       7.0   8.0   9.0   10.0  11.0  12.0  13.0  14.0  \
acc_open_past_24mths_segm                                                   
1                             0     0     0     0     0     0     0     0   
2                             0     0     0     0     0     0     0     0   
3                          6563  4709  3377  2375  1553  1137   746   515   

acc_open_past_24mths       15.0  16.0  17.0  18.0  19.0  20.0  21.0  22.0  \
acc_open_past_24mths_segm                                                   
1                             0     0     0     0     0     0     0     0   
2                             0     0     0     0     0     0     0     0   
3                           344   228   192   119    78    47    40    36   

acc_open_past_24mths       23.0  24.0  25.0  26.0  27.0  28.0  29.0  30.0  \
acc_open_past_24mths_segm                                                   
1                             0     0     0     0     0     0     0     0   
2                             0     0     0     0     0     0     0     0   
3                            20    16    13     6     6     4     5     4   

acc_open_past_24mths       31.0  33.0  35.0  36.0  37.0  38.0  
acc_open_past_24mths_segm                                      
1                             0     0     0     0     0     0  
2                             0     0     0     0     0     0  
3                             1     1     1     1     1     1  
Crosstab with Absolute Volumes:
 verification_status       Not Verified  Source Verified  Verified
verification_status_segm                                         
1                                    0                0     24769
2                                    0            41644         0
3                                41451                0         0
  • Calculate and print the IVs
In [64]:
# X_train_segmented is the dataset with segmented variables
# target_variable is the target variable with default information
iv_df = calculate_iv(X_train_segmented, y_train)
print(iv_df)
                            Variable         IV
16            total_pymnt_round_segm  55.609418
0         mths_since_recent_inq_segm   7.757868
9           verification_status_segm   7.537108
12               inq_last_6mths_segm   7.337686
2                fico_range_low_segm   6.496426
23                 inq_last_12m_segm   4.925817
3                          term_segm   4.768724
10                      purpose_segm   3.344770
7                instlmnt_round_segm   3.042972
25                     mort_acc_segm   2.967846
4                    emp_length_segm   2.744972
19                  open_acc_6m_segm   2.630914
24         acc_open_past_24mths_segm   2.506123
22                  open_rv_24m_segm   2.240605
21                  open_rv_12m_segm   2.041305
15                    revol_bal_segm   1.635041
6              Annual_Inc_round_segm   1.607459
17  mths_since_last_major_derog_segm   1.044393
20                  open_il_12m_segm   0.756474
1                   delinq_2yrs_segm   0.495526
13       mths_since_last_delinq_segm   0.396014
14       mths_since_last_record_segm   0.387369
5                   dti_rounded_segm   0.372996
18             application_type_segm   0.369479
11       years_with_Credit_line_segm   0.103906
8                home_ownership_segm   0.062790
  • Visualization of the patterns from the segmented characteristics
In [65]:
# Mapping dictionary for legends
mapping_dict = {
    'mths_since_recent_inq_segm': {
        '0-2 months': 1,
        '3-10 months': 2,
        '11+ months': 3,
        'MISSING': -1
    },
    'delinq_2yrs_segm': {
        '3+ delinquencies': 1,
        '2 delinquencies': 2,
        '1 delinquency': 3,
        '0 delinquencies': 4
    },
    'fico_range_low_segm': {
        '<=680': 1,
        '685-700': 2,
        '705-720': 3,
        '725-790': 4,
        '791+': 5
    },
    'term_segm': {
        '60 months': 1,
        '36 months': 2
    },
    'emp_length_segm': {
        '10+ years': 1,
        '9 years': 1,
        '1-8 years': 2,
        '< 1 year': 2,
        'MISSING': -1
    },
    'dti_rounded_segm': {
        '0-7': 1,
        '8+': 2,
        'MISSING': -1
    },
    'Annual_Inc_round_segm': {
        '<=84999': 1,
        '85000-109999': 2,
        '110000-214999': 3,
        '215000+': 4
    },
    'instlmnt_round_segm': {
        '<=479': 1,
        '480-699': 2,
        '700-879': 3,
        '880+': 4
    },
    'home_ownership_segm': {
        'ANY, OWN': 1,
        'MORTGAGE, RENT': 2
    },
    'verification_status_segm': {
        'Verified': 1,
        'Source Verified': 2,
        'Not Verified': 3
    },
    'purpose_segm': {
        'car, house, major_purchase, medical, moving, small_business': 1,
        'debt_consolidation, home_improvement, other, renewable_energy, vacation, wedding': 2,
        'credit_card': 3
    },
    'years_with_Credit_line_segm': {
        '10-18 years': 1,
        '40+ years': 1,
        'Other': 2
    },
    'inq_last_6mths_segm': {
        '3+ inquiries': 1,
        '2 inquiries': 2,
        '1 inquiry': 3,
        '0 inquiries': 4
    },
    'mths_since_last_delinq_segm': {
        '0-45 months': 1,
        '46+': 2,
        'MISSING': -1
    },
    'mths_since_last_record_segm': {
        '1-68 months': 1,
        '69+ months': 2,
        'MISSING': -1
    },
    'revol_bal_segm': {
        '<=9999': 1,
        '10000-19999': 2,
        '20000-39999': 3,
        '40000+': 4
    },
#     'out_prncp_round_segm': {
#         '>0': 1,
#         '0': 2
#     },
    'total_pymnt_round_segm': {
        '<=4999': 1,
        '5000-9999': 2,
        '10000-14999': 3,
        '15000+': 4
    },
    'mths_since_last_major_derog_segm': {
        '0-45 months': 1,
        '46-78 months': 2,
        '79+ months': 3,
        'MISSING': -1
    },
    'application_type_segm': {
        'Joint_App': 1,
        'Individual': 2
    },
    'open_acc_6m_segm': {
        '<=1': 1,
        '>1': 2,
        'MISSING': -1
    },
    'open_il_12m_segm': {
        '<=1': 1,
        '>1': 2,
        'MISSING': -1
    },
    'open_rv_12m_segm': {
        '<=1': 1,
        '>1': 2,
        'MISSING': -1
    },
    'open_rv_24m_segm': {
        '<=1': 1,
        '2-6': 2,
        '7+': 3,
        'MISSING': -1
    },
    'inq_last_12m_segm': {
        '<=1': 1,
        '2-6': 2,
        '7+': 3,
        'MISSING': -1
    },
    'acc_open_past_24mths_segm': {
        '<=1': 1,
        '2-6': 2,
        '7+': 3,
        'MISSING': -1
    },
    'mort_acc_segm': {
        '0': 1,
        '1': 2,
        '2': 3,
        '3+': 4
    }
}

# List of segmented variables
segmented_vars = [
#     'out_prncp_round_segm',
    'total_pymnt_round_segm',
    'verification_status_segm',
    'mths_since_recent_inq_segm',
    'inq_last_6mths_segm',
    'inq_last_12m_segm',
    'fico_range_low_segm',
    'purpose_segm',
    'instlmnt_round_segm',
    'acc_open_past_24mths_segm',
    'term_segm',
    'open_rv_24m_segm',
    'open_acc_6m_segm',
    'mort_acc_segm',
    'open_rv_12m_segm',
    'revol_bal_segm',
    'emp_length_segm',
    'open_il_12m_segm',
    'mths_since_last_major_derog_segm',
    'Annual_Inc_round_segm',
    'dti_rounded_segm',
    'mths_since_last_record_segm',
    'mths_since_last_delinq_segm',
    'delinq_2yrs_segm',
    'application_type_segm',
    'home_ownership_segm',
    'years_with_Credit_line_segm'
]

# concat as to have a single df
train_data = X_train_segmented.copy()
train_data['Defaulted'] = y_train

# Define color palette
color_palette = sns.color_palette("hsv", 10)

# Function to create bar plots for default rate
def plot_default_rate(segment):
    default_rates = train_data.groupby(segment)['Defaulted'].mean().reset_index()
    default_rates['Defaulted'] = default_rates['Defaulted'] * 100  # Convert to percentage
    
    # Calculate the percentage of the total population for each segment
    population_percentage = train_data[segment].value_counts(normalize=True).reset_index()
    population_percentage.columns = [segment, 'PopulationPercentage']
    population_percentage['PopulationPercentage'] = population_percentage['PopulationPercentage'] * 100  # Convert to percentage

    # Merge the default rates with the population percentages
    merged_data = pd.merge(default_rates, population_percentage, on=segment)

    # Create a color map based on the mapping dictionary
    unique_vals = merged_data[segment].astype(str).unique()
    mapping = mapping_dict.get(segment, {})
    color_map = {str(val): color_palette[idx % len(color_palette)] for idx, val in enumerate(unique_vals)}

    plt.figure(figsize=(12, 6))
    bar_plot = sns.barplot(x=segment, y='Defaulted', data=merged_data, palette=color_map)
    plt.title(f'Default Rate by {segment}')
    plt.xlabel(segment)
    plt.ylabel('Default Rate (%)')

    # Add percentage of the total population as text annotations
    for index, row in merged_data.iterrows():
        bar_plot.text(index, row['Defaulted'] / 2, f'{row["PopulationPercentage"]:.2f}%', color='black', ha="center", va="center")

    # Create legend
    handles = [plt.Line2D([0], [0], color=color_map.get(str(v), 'grey'), lw=4, label=f'{k}: {v}') for k, v in mapping.items()]
    plt.legend(handles=handles, title='Mappings', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.show()

# Create plots for each segmented variable
for var in segmented_vars:
    plot_default_rate(var)
In [66]:
for var in segmented_vars:
    print(f"{var}: {X_train_segmented[var].dtype}")
total_pymnt_round_segm: int32
verification_status_segm: int32
mths_since_recent_inq_segm: int32
inq_last_6mths_segm: int32
inq_last_12m_segm: int32
fico_range_low_segm: int64
purpose_segm: int32
instlmnt_round_segm: int32
acc_open_past_24mths_segm: int32
term_segm: int32
open_rv_24m_segm: int32
open_acc_6m_segm: int32
mort_acc_segm: int32
open_rv_12m_segm: int32
revol_bal_segm: int32
emp_length_segm: int32
open_il_12m_segm: int32
mths_since_last_major_derog_segm: int32
Annual_Inc_round_segm: int32
dti_rounded_segm: int32
mths_since_last_record_segm: int32
mths_since_last_delinq_segm: int32
delinq_2yrs_segm: int32
application_type_segm: int32
home_ownership_segm: int32
years_with_Credit_line_segm: int32
  • Create dummy variables for all the segmented ones
In [67]:
from feature_engine.encoding import OneHotEncoder
dummy_encoder = OneHotEncoder(top_categories=None,
        variables=segmented_vars,  # we select which variables to encode
        drop_last=True) # since we have linear model, I want to drop 1 category from each variable. For n categories, n-1 
                        ## dummy variables will be created
    

# wll cast them as categorical, in order to be compatible with the encoder

# Keep the segmented vars only!
X_train_segmented_final = X_train_segmented[segmented_vars]
X_test_segmented_final = X_test_segmented[segmented_vars]

for var in segmented_vars:
    X_train_segmented_final[var] = X_train_segmented_final[var].astype('category')

for var in segmented_vars:
    X_test_segmented_final[var] = X_test_segmented_final[var].astype('category')

# Fittarw only sto the training set
dummy_encoder.fit(X_train_segmented_final)
Out[67]:
OneHotEncoder(drop_last=True,
              variables=['total_pymnt_round_segm', 'verification_status_segm',
                         'mths_since_recent_inq_segm', 'inq_last_6mths_segm',
                         'inq_last_12m_segm', 'fico_range_low_segm',
                         'purpose_segm', 'instlmnt_round_segm',
                         'acc_open_past_24mths_segm', 'term_segm',
                         'open_rv_24m_segm', 'open_acc_6m_segm',
                         'mort_acc_segm', 'open_rv_12m_segm', 'revol_bal_segm',
                         'emp_length_segm', 'open_il_12m_segm',
                         'mths_since_last_major_derog_segm',
                         'Annual_Inc_round_segm', 'dti_rounded_segm',
                         'mths_since_last_record_segm',
                         'mths_since_last_delinq_segm', 'delinq_2yrs_segm',
                         'application_type_segm', 'home_ownership_segm',
                         'years_with_Credit_line_segm'])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
OneHotEncoder(drop_last=True,
              variables=['total_pymnt_round_segm', 'verification_status_segm',
                         'mths_since_recent_inq_segm', 'inq_last_6mths_segm',
                         'inq_last_12m_segm', 'fico_range_low_segm',
                         'purpose_segm', 'instlmnt_round_segm',
                         'acc_open_past_24mths_segm', 'term_segm',
                         'open_rv_24m_segm', 'open_acc_6m_segm',
                         'mort_acc_segm', 'open_rv_12m_segm', 'revol_bal_segm',
                         'emp_length_segm', 'open_il_12m_segm',
                         'mths_since_last_major_derog_segm',
                         'Annual_Inc_round_segm', 'dti_rounded_segm',
                         'mths_since_last_record_segm',
                         'mths_since_last_delinq_segm', 'delinq_2yrs_segm',
                         'application_type_segm', 'home_ownership_segm',
                         'years_with_Credit_line_segm'])
In [68]:
X_train_segmented_final
Out[68]:
total_pymnt_round_segm verification_status_segm mths_since_recent_inq_segm inq_last_6mths_segm inq_last_12m_segm fico_range_low_segm purpose_segm instlmnt_round_segm acc_open_past_24mths_segm term_segm open_rv_24m_segm open_acc_6m_segm mort_acc_segm open_rv_12m_segm revol_bal_segm emp_length_segm open_il_12m_segm mths_since_last_major_derog_segm Annual_Inc_round_segm dti_rounded_segm mths_since_last_record_segm mths_since_last_delinq_segm delinq_2yrs_segm application_type_segm home_ownership_segm years_with_Credit_line_segm
0 1 1 1 4 2 1 2 1 2 2 2 1 1 1 1 2 1 1 1 2 -1 1 3 2 2 2
1 2 2 2 4 3 2 2 2 3 2 3 2 4 2 3 2 2 -1 3 2 -1 2 4 2 1 2
2 1 3 1 2 3 1 1 1 2 1 1 1 1 1 1 2 1 2 1 2 -1 1 3 -1 2 1
3 2 2 1 3 2 1 2 2 2 1 1 1 3 1 1 1 1 1 1 2 -1 1 3 2 2 2
4 1 2 2 4 2 4 1 1 2 2 1 1 4 1 1 1 1 -1 1 1 -1 1 4 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107859 3 3 -1 4 1 1 2 1 2 1 1 1 2 1 2 -1 2 1 2 2 -1 1 4 2 2 2
107860 1 3 3 4 1 2 2 1 2 2 2 1 4 1 3 1 1 -1 1 2 -1 -1 4 2 2 2
107861 1 2 2 4 2 2 2 1 2 2 1 1 3 1 1 1 1 2 1 2 -1 2 4 2 2 2
107862 2 1 3 4 1 2 2 1 2 2 2 2 2 2 2 2 1 1 1 2 -1 1 3 2 2 2
107863 3 2 1 3 2 1 1 1 2 2 2 1 1 1 1 2 1 1 1 1 -1 1 4 2 1 1

107864 rows × 26 columns

In [69]:
dummy_encoder.variables_
Out[69]:
['total_pymnt_round_segm',
 'verification_status_segm',
 'mths_since_recent_inq_segm',
 'inq_last_6mths_segm',
 'inq_last_12m_segm',
 'fico_range_low_segm',
 'purpose_segm',
 'instlmnt_round_segm',
 'acc_open_past_24mths_segm',
 'term_segm',
 'open_rv_24m_segm',
 'open_acc_6m_segm',
 'mort_acc_segm',
 'open_rv_12m_segm',
 'revol_bal_segm',
 'emp_length_segm',
 'open_il_12m_segm',
 'mths_since_last_major_derog_segm',
 'Annual_Inc_round_segm',
 'dti_rounded_segm',
 'mths_since_last_record_segm',
 'mths_since_last_delinq_segm',
 'delinq_2yrs_segm',
 'application_type_segm',
 'home_ownership_segm',
 'years_with_Credit_line_segm']
In [70]:
# transform training and test sets
X_train_encoded = dummy_encoder.transform(X_train_segmented_final)
X_test_encoded = dummy_encoder.transform(X_test_segmented_final)
X_train_encoded
Out[70]:
total_pymnt_round_segm_1 total_pymnt_round_segm_2 total_pymnt_round_segm_3 verification_status_segm_1 verification_status_segm_2 mths_since_recent_inq_segm_1 mths_since_recent_inq_segm_2 mths_since_recent_inq_segm_-1 inq_last_6mths_segm_4 inq_last_6mths_segm_2 inq_last_6mths_segm_3 inq_last_12m_segm_2 inq_last_12m_segm_3 fico_range_low_segm_1 fico_range_low_segm_2 fico_range_low_segm_4 fico_range_low_segm_5 purpose_segm_2 purpose_segm_1 instlmnt_round_segm_1 instlmnt_round_segm_2 instlmnt_round_segm_3 acc_open_past_24mths_segm_2 acc_open_past_24mths_segm_3 term_segm_2 open_rv_24m_segm_2 open_rv_24m_segm_3 open_acc_6m_segm_1 mort_acc_segm_1 mort_acc_segm_4 mort_acc_segm_3 open_rv_12m_segm_1 revol_bal_segm_1 revol_bal_segm_3 revol_bal_segm_2 emp_length_segm_2 emp_length_segm_1 open_il_12m_segm_1 mths_since_last_major_derog_segm_1 mths_since_last_major_derog_segm_-1 mths_since_last_major_derog_segm_2 Annual_Inc_round_segm_1 Annual_Inc_round_segm_3 Annual_Inc_round_segm_2 dti_rounded_segm_2 dti_rounded_segm_1 mths_since_last_record_segm_-1 mths_since_last_record_segm_2 mths_since_last_delinq_segm_1 mths_since_last_delinq_segm_2 delinq_2yrs_segm_3 delinq_2yrs_segm_4 delinq_2yrs_segm_1 application_type_segm_2 home_ownership_segm_2 years_with_Credit_line_segm_2
0 1 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 1 1
1 0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1
2 1 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0 1 0 1 0 0 1 1 0 0 1 0 1 0 1 0 1 0 0 0 1 0
3 0 1 0 0 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 1 1
4 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 1 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107859 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 1 0 1 0 1 0 0 1 0 1 1 1
107860 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 1 0 1 0 1 0 1 0 1 0 0 1 1 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 1
107861 1 0 0 0 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 1 0 0 0 1 1 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1
107862 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 1 1
107863 0 0 1 0 1 1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 1 1 0 0 1 1 0 0 1 0 1 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0 0

107864 rows × 56 columns

¶

Logistic Regression - Credit Score model¶

  • Feature Selection
In [71]:
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import pandas as pd

# initialize logistic regression model
log_reg = LogisticRegression(random_state=100)


# Logistic Regression - Forward stepwise selection
sfs_forward = SFS(log_reg,
                  k_features='best', 
                  forward=True,
                  floating=False,
                  scoring='roc_auc',
                  cv=5)

# Fit the SFS to the training data
sfs_log_reg_fitted = sfs_forward.fit(X_train_encoded, y_train)

# Create a DataFrame to store the results
sfs_log_reg_pdf = pd.DataFrame(sfs_log_reg_fitted.subsets_).T
sfs_log_reg_pdf
Out[71]:
feature_idx cv_scores avg_score feature_names
1 (0,) [0.6439882397054446, 0.626075905668313, 0.6218... 0.631047 (total_pymnt_round_segm_1,)
2 (0, 19) [0.7151702714308072, 0.6883516520327987, 0.685... 0.693707 (total_pymnt_round_segm_1, instlmnt_round_segm_1)
3 (0, 1, 19) [0.7074235829754714, 0.704404450915725, 0.7225... 0.719208 (total_pymnt_round_segm_1, total_pymnt_round_s...
4 (0, 1, 19, 20) [0.7812859182667049, 0.7654186900552782, 0.777... 0.775343 (total_pymnt_round_segm_1, total_pymnt_round_s...
5 (0, 1, 19, 20, 24) [0.792806372720185, 0.7797363148019998, 0.7895... 0.788324 (total_pymnt_round_segm_1, total_pymnt_round_s...
6 (0, 1, 8, 19, 20, 24) [0.7972875363010188, 0.7911015291335035, 0.799... 0.797291 (total_pymnt_round_segm_1, total_pymnt_round_s...
7 (0, 1, 8, 13, 19, 20, 24) [0.8012594738989005, 0.7970304717242864, 0.802... 0.802254 (total_pymnt_round_segm_1, total_pymnt_round_s...
8 (0, 1, 3, 8, 13, 19, 20, 24) [0.8056326610862748, 0.8001170012743734, 0.807... 0.80611 (total_pymnt_round_segm_1, total_pymnt_round_s...
9 (0, 1, 3, 8, 13, 19, 20, 21, 24) [0.8104463668898177, 0.8057966379562842, 0.811... 0.810134 (total_pymnt_round_segm_1, total_pymnt_round_s...
10 (0, 1, 3, 8, 13, 19, 20, 21, 24, 28) [0.812859827302234, 0.808408212192469, 0.81275... 0.812424 (total_pymnt_round_segm_1, total_pymnt_round_s...
11 (0, 1, 2, 3, 8, 13, 19, 20, 21, 24, 28) [0.8123589994832174, 0.8083858261967757, 0.815... 0.814345 (total_pymnt_round_segm_1, total_pymnt_round_s...
12 (0, 1, 2, 3, 8, 13, 19, 20, 21, 24, 28, 53) [0.8154586382368225, 0.8107458756084481, 0.817... 0.816312 (total_pymnt_round_segm_1, total_pymnt_round_s...
13 (0, 1, 2, 3, 4, 8, 13, 19, 20, 21, 24, 28, 53) [0.81613084948234, 0.8132797877972238, 0.81846... 0.817912 (total_pymnt_round_segm_1, total_pymnt_round_s...
14 (0, 1, 2, 3, 4, 8, 13, 19, 20, 21, 23, 24, 28,... [0.8181919630466461, 0.8149916991081677, 0.819... 0.819402 (total_pymnt_round_segm_1, total_pymnt_round_s...
15 (0, 1, 2, 3, 4, 8, 13, 14, 19, 20, 21, 23, 24,... [0.8192254027805981, 0.8163605374299574, 0.820... 0.820762 (total_pymnt_round_segm_1, total_pymnt_round_s...
16 (0, 1, 2, 3, 4, 5, 8, 13, 14, 19, 20, 21, 23, ... [0.8201646541514327, 0.8159830891428601, 0.821... 0.821901 (total_pymnt_round_segm_1, total_pymnt_round_s...
17 (0, 1, 2, 3, 4, 5, 8, 13, 14, 19, 20, 21, 23, ... [0.8208574220967332, 0.8169434143585887, 0.822... 0.822556 (total_pymnt_round_segm_1, total_pymnt_round_s...
18 (0, 1, 2, 3, 4, 5, 8, 13, 14, 19, 20, 21, 23, ... [0.8212709555678036, 0.8177132528243932, 0.823... 0.82323 (total_pymnt_round_segm_1, total_pymnt_round_s...
19 (0, 1, 2, 3, 4, 5, 8, 13, 14, 19, 20, 21, 23, ... [0.8241231797561632, 0.8196227549942066, 0.826... 0.82481 (total_pymnt_round_segm_1, total_pymnt_round_s...
20 (0, 1, 2, 3, 4, 5, 8, 13, 14, 17, 19, 20, 21, ... [0.8242745078658027, 0.8201652619737698, 0.827... 0.825401 (total_pymnt_round_segm_1, total_pymnt_round_s...
21 (0, 1, 2, 3, 4, 5, 8, 13, 14, 17, 18, 19, 20, ... [0.8249949234979195, 0.8217764631458153, 0.829... 0.826552 (total_pymnt_round_segm_1, total_pymnt_round_s...
22 (0, 1, 2, 3, 4, 5, 8, 13, 14, 16, 17, 18, 19, ... [0.8256054825506209, 0.8219440628545791, 0.830... 0.827039 (total_pymnt_round_segm_1, total_pymnt_round_s...
23 (0, 1, 2, 3, 4, 5, 8, 10, 13, 14, 16, 17, 18, ... [0.8261880715989136, 0.8217945007826329, 0.830... 0.827479 (total_pymnt_round_segm_1, total_pymnt_round_s...
24 (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 16, 17, 1... [0.8267210595510402, 0.8211445195503647, 0.832... 0.827882 (total_pymnt_round_segm_1, total_pymnt_round_s...
25 (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... [0.8279712041462934, 0.8219215158085571, 0.831... 0.828335 (total_pymnt_round_segm_1, total_pymnt_round_s...
26 (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... [0.8280176715992001, 0.8225550878017761, 0.832... 0.828645 (total_pymnt_round_segm_1, total_pymnt_round_s...
27 (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... [0.8286929806749115, 0.8227898633921007, 0.832... 0.828678 (total_pymnt_round_segm_1, total_pymnt_round_s...
28 (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... [0.8287795766680741, 0.8234944406858081, 0.832... 0.82876 (total_pymnt_round_segm_1, total_pymnt_round_s...
29 (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... [0.828717512625001, 0.8236664961203333, 0.8322... 0.828843 (total_pymnt_round_segm_1, total_pymnt_round_s...
30 (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... [0.8284529436012258, 0.8242021495136853, 0.832... 0.828895 (total_pymnt_round_segm_1, total_pymnt_round_s...
31 (0, 1, 2, 3, 4, 5, 7, 8, 10, 12, 13, 14, 15, 1... [0.8286129921923936, 0.824072289265287, 0.8330... 0.829011 (total_pymnt_round_segm_1, total_pymnt_round_s...
32 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15... [0.8284227173736703, 0.8241185464985941, 0.833... 0.829022 (total_pymnt_round_segm_1, total_pymnt_round_s...
33 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15... [0.8286569706372254, 0.8241361725623494, 0.833... 0.829081 (total_pymnt_round_segm_1, total_pymnt_round_s...
34 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15... [0.8288580789083605, 0.8238451009348936, 0.832... 0.829077 (total_pymnt_round_segm_1, total_pymnt_round_s...
35 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15... [0.8290385588535808, 0.8241533154751186, 0.832... 0.829189 (total_pymnt_round_segm_1, total_pymnt_round_s...
36 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.828690258881909, 0.8241913949306223, 0.8324... 0.829131 (total_pymnt_round_segm_1, total_pymnt_round_s...
37 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8287036171554611, 0.8241492892169003, 0.832... 0.829164 (total_pymnt_round_segm_1, total_pymnt_round_s...
38 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8288726011065521, 0.824220741879413, 0.8329... 0.829187 (total_pymnt_round_segm_1, total_pymnt_round_s...
39 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.828912299890018, 0.8241475355577652, 0.8328... 0.829199 (total_pymnt_round_segm_1, total_pymnt_round_s...
40 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8288817871579359, 0.8245380468159704, 0.832... 0.829251 (total_pymnt_round_segm_1, total_pymnt_round_s...
41 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8284581544023032, 0.8248537949326838, 0.832... 0.829208 (total_pymnt_round_segm_1, total_pymnt_round_s...
42 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8287304769548294, 0.8243444285318767, 0.832... 0.829149 (total_pymnt_round_segm_1, total_pymnt_round_s...
43 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8286825948858225, 0.8247075433397171, 0.832... 0.829059 (total_pymnt_round_segm_1, total_pymnt_round_s...
44 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8285633194700956, 0.8242707569537238, 0.832... 0.829037 (total_pymnt_round_segm_1, total_pymnt_round_s...
45 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.828550408859866, 0.8242014158399654, 0.8328... 0.828942 (total_pymnt_round_segm_1, total_pymnt_round_s...
46 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8289079127894546, 0.8241442071843049, 0.832... 0.829038 (total_pymnt_round_segm_1, total_pymnt_round_s...
47 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8286533535175773, 0.8242787915756792, 0.832... 0.828998 (total_pymnt_round_segm_1, total_pymnt_round_s...
48 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8282594456065784, 0.8244554995752567, 0.832... 0.828971 (total_pymnt_round_segm_1, total_pymnt_round_s...
49 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8283608682089921, 0.8245476561522515, 0.832... 0.828916 (total_pymnt_round_segm_1, total_pymnt_round_s...
50 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8282044367374727, 0.8243196088867714, 0.832... 0.82886 (total_pymnt_round_segm_1, total_pymnt_round_s...
51 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8284016951040315, 0.8242619707635674, 0.832... 0.828829 (total_pymnt_round_segm_1, total_pymnt_round_s...
52 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8284335329195492, 0.8240394349982263, 0.832... 0.828781 (total_pymnt_round_segm_1, total_pymnt_round_s...
53 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... [0.8281119674015149, 0.8239536130674949, 0.832... 0.828654 (total_pymnt_round_segm_1, total_pymnt_round_s...
54 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... [0.8281095500195719, 0.823945793179311, 0.8328... 0.828561 (total_pymnt_round_segm_1, total_pymnt_round_s...
55 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... [0.8279925129204588, 0.8239311197049157, 0.832... 0.828534 (total_pymnt_round_segm_1, total_pymnt_round_s...
56 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... [0.8278504783014007, 0.8238715668722479, 0.832... 0.828366 (total_pymnt_round_segm_1, total_pymnt_round_s...
  • We choose to stop were the performance stops improving, in order to prevent the overfit as well

This is occurred the combination in row 27

In [72]:
sfs_log_reg_pdf['feature_idx'][27]
Out[72]:
(0,
 1,
 2,
 3,
 4,
 5,
 7,
 8,
 10,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 23,
 24,
 27,
 28,
 35,
 36,
 42,
 49,
 53)
In [73]:
X_train_encoded
Out[73]:
total_pymnt_round_segm_1 total_pymnt_round_segm_2 total_pymnt_round_segm_3 verification_status_segm_1 verification_status_segm_2 mths_since_recent_inq_segm_1 mths_since_recent_inq_segm_2 mths_since_recent_inq_segm_-1 inq_last_6mths_segm_4 inq_last_6mths_segm_2 inq_last_6mths_segm_3 inq_last_12m_segm_2 inq_last_12m_segm_3 fico_range_low_segm_1 fico_range_low_segm_2 fico_range_low_segm_4 fico_range_low_segm_5 purpose_segm_2 purpose_segm_1 instlmnt_round_segm_1 instlmnt_round_segm_2 instlmnt_round_segm_3 acc_open_past_24mths_segm_2 acc_open_past_24mths_segm_3 term_segm_2 open_rv_24m_segm_2 open_rv_24m_segm_3 open_acc_6m_segm_1 mort_acc_segm_1 mort_acc_segm_4 mort_acc_segm_3 open_rv_12m_segm_1 revol_bal_segm_1 revol_bal_segm_3 revol_bal_segm_2 emp_length_segm_2 emp_length_segm_1 open_il_12m_segm_1 mths_since_last_major_derog_segm_1 mths_since_last_major_derog_segm_-1 mths_since_last_major_derog_segm_2 Annual_Inc_round_segm_1 Annual_Inc_round_segm_3 Annual_Inc_round_segm_2 dti_rounded_segm_2 dti_rounded_segm_1 mths_since_last_record_segm_-1 mths_since_last_record_segm_2 mths_since_last_delinq_segm_1 mths_since_last_delinq_segm_2 delinq_2yrs_segm_3 delinq_2yrs_segm_4 delinq_2yrs_segm_1 application_type_segm_2 home_ownership_segm_2 years_with_Credit_line_segm_2
0 1 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 1 1
1 0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1
2 1 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0 1 0 1 0 0 1 1 0 0 1 0 1 0 1 0 1 0 0 0 1 0
3 0 1 0 0 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 1 1
4 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 1 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107859 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 1 0 1 0 1 0 0 1 0 1 1 1
107860 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 1 0 1 0 1 0 1 0 1 0 0 1 1 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 1
107861 1 0 0 0 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 1 0 0 0 1 1 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1
107862 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 1 1
107863 0 0 1 0 1 1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 1 1 0 0 1 1 0 0 1 0 1 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0 0

107864 rows × 56 columns

  • Select the corresponding Indexes from the training and test sets (with the dummy variables)
In [74]:
X_train_log_reg = X_train_encoded.iloc[:,[0, 1,
                 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 27, 28, 35, 36, 42, 49, 53]]
X_test_log_reg  = X_test_encoded.iloc[:, [0, 1,
                 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 27, 28, 35, 36, 42, 49, 53]]
X_train_log_reg
Out[74]:
total_pymnt_round_segm_1 total_pymnt_round_segm_2 total_pymnt_round_segm_3 verification_status_segm_1 verification_status_segm_2 mths_since_recent_inq_segm_1 mths_since_recent_inq_segm_-1 inq_last_6mths_segm_4 inq_last_6mths_segm_3 fico_range_low_segm_1 fico_range_low_segm_2 fico_range_low_segm_4 fico_range_low_segm_5 purpose_segm_2 purpose_segm_1 instlmnt_round_segm_1 instlmnt_round_segm_2 instlmnt_round_segm_3 acc_open_past_24mths_segm_3 term_segm_2 open_acc_6m_segm_1 mort_acc_segm_1 emp_length_segm_2 emp_length_segm_1 Annual_Inc_round_segm_3 mths_since_last_delinq_segm_2 application_type_segm_2
0 1 0 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 1 1 1 0 0 0 1
1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 0 0 1 0 1 1 1
2 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0
3 0 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1
4 1 0 0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 0 0 1 1 0 0 1 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107859 0 0 1 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1
107860 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 1 1 0 0 1 0 0 1
107861 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 0 1 1 0 0 1 0 1 1
107862 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1
107863 0 0 1 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1

107864 rows × 27 columns

  • Fit Logistic Regression model on the best performing combination
In [75]:
log_reg.fit(X_train_log_reg, y_train)
Out[75]:
LogisticRegression(random_state=100)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(random_state=100)
  • Training data: KS - AUC - Gini
In [76]:
calculate_DiscriminatoryStats(X_train_log_reg, y_train, log_reg, 'TRAINING')
     Credit Score  num_applicants  num_goods  num_bads  total  cum_freq_goods  \
0             171               1        0.0       1.0    1.0             0.0   
1             178               1        0.0       1.0    1.0             0.0   
2             179               2        0.0       2.0    2.0             0.0   
3             181               1        0.0       1.0    1.0             0.0   
4             183               2        0.0       2.0    2.0             0.0   
..            ...             ...        ...       ...    ...             ...   
595           820               2        2.0       0.0    2.0        100940.0   
596           821               2        2.0       0.0    2.0        100942.0   
597           824               2        2.0       0.0    2.0        100944.0   
598           829               1        1.0       0.0    1.0        100945.0   
599           832               1        1.0       0.0    1.0        100946.0   

     cum_freq_bads  perc_total_goods  perc_total_bads  cum_perc_goods  \
0              1.0               0.0             0.01            0.00   
1              2.0               0.0             0.01            0.00   
2              4.0               0.0             0.03            0.00   
3              5.0               0.0             0.01            0.00   
4              7.0               0.0             0.03            0.00   
..             ...               ...              ...             ...   
595         6918.0               0.0             0.00           99.99   
596         6918.0               0.0             0.00          100.00   
597         6918.0               0.0             0.00          100.00   
598         6918.0               0.0             0.00          100.00   
599         6918.0               0.0             0.00          100.00   

     cum_perc_bads  Separation  
0             0.01       -0.01  
1             0.03       -0.03  
2             0.06       -0.06  
3             0.07       -0.07  
4             0.10       -0.10  
..             ...         ...  
595         100.00       -0.01  
596         100.00        0.00  
597         100.00        0.00  
598         100.00        0.00  
599         100.00        0.00  

[600 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TRAINING data is: 46.86
AUC metric on the TRAINING data is: 0.83
Gini metric on the TRAINING data is: 0.66
  • Test data: KS - AUC - Gini
In [77]:
calculate_DiscriminatoryStats(X_test_log_reg, y_test, log_reg, 'TEST')
     Credit Score  num_applicants  num_goods  num_bads  total  cum_freq_goods  \
0             167               1        0.0       1.0    1.0             0.0   
1             175               1        0.0       1.0    1.0             0.0   
2             182               1        0.0       1.0    1.0             0.0   
3             183               2        0.0       2.0    2.0             0.0   
4             184               1        0.0       1.0    1.0             0.0   
..            ...             ...        ...       ...    ...             ...   
570           813               1        1.0       0.0    1.0         40545.0   
571           814               3        3.0       0.0    3.0         40548.0   
572           815               1        1.0       0.0    1.0         40549.0   
573           818               1        1.0       0.0    1.0         40550.0   
574           824               1        1.0       0.0    1.0         40551.0   

     cum_freq_bads  perc_total_goods  perc_total_bads  cum_perc_goods  \
0              1.0              0.00             0.04            0.00   
1              2.0              0.00             0.04            0.00   
2              3.0              0.00             0.04            0.00   
3              5.0              0.00             0.08            0.00   
4              6.0              0.00             0.04            0.00   
..             ...               ...              ...             ...   
570         2377.0              0.00             0.00           99.99   
571         2377.0              0.01             0.00           99.99   
572         2377.0              0.00             0.00          100.00   
573         2377.0              0.00             0.00          100.00   
574         2377.0              0.00             0.00          100.00   

     cum_perc_bads  Separation  
0             0.04       -0.04  
1             0.08       -0.08  
2             0.13       -0.13  
3             0.21       -0.21  
4             0.25       -0.25  
..             ...         ...  
570         100.00       -0.01  
571         100.00       -0.01  
572         100.00        0.00  
573         100.00        0.00  
574         100.00        0.00  

[575 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TEST data is: 43.40
AUC metric on the TEST data is: 0.81
Gini metric on the TEST data is: 0.63

Population Stability Index (PSI) & IV comparison¶

In [78]:
calculate_and_plot_psi(X_train_log_reg, X_test_log_reg, log_reg, 'TRAINING', 'TEST')
The PSI statistic between TRAINING and TEST sets is: 0.166
Moderate shift in the population (PSI = 0.166)
In [79]:
# Calculate and compare IVs
iv_comparison_df = calculate_iv_comparison(X_train_log_reg, X_test_log_reg, y_train, y_test)
iv_comparison_df
Out[79]:
Variable IV_Train IV_Test
0 total_pymnt_round_segm_1 28.43 33.87
2 total_pymnt_round_segm_3 11.75 21.32
7 inq_last_6mths_segm_4 5.97 8.43
5 mths_since_recent_inq_segm_1 5.91 7.37
3 verification_status_segm_1 5.11 4.83
19 term_segm_2 4.77 2.90
9 fico_range_low_segm_1 3.38 3.89
11 fico_range_low_segm_4 3.19 2.70
21 mort_acc_segm_1 2.77 3.32
15 instlmnt_round_segm_1 2.69 2.55
20 open_acc_6m_segm_1 2.63 3.67
18 acc_open_past_24mths_segm_3 2.23 3.59
6 mths_since_recent_inq_segm_-1 2.18 2.89
1 total_pymnt_round_segm_2 1.87 7.45
12 fico_range_low_segm_5 1.29 1.08
23 emp_length_segm_1 1.28 1.33
8 inq_last_6mths_segm_3 1.27 2.07
24 Annual_Inc_round_segm_3 1.07 1.37
13 purpose_segm_2 0.81 1.11
14 purpose_segm_1 0.79 0.66
17 instlmnt_round_segm_3 0.49 0.84
10 fico_range_low_segm_2 0.39 0.21
26 application_type_segm_2 0.37 0.49
16 instlmnt_round_segm_2 0.29 0.46
4 verification_status_segm_2 0.06 0.18
22 emp_length_segm_2 0.03 0.05
25 mths_since_last_delinq_segm_2 0.03 0.01

¶

Random Forest - Credit Score model¶

  • Feature Selection

Random Forest does not need segmented variables -it is a decision tree model and can handle categorical data exeptionally well

In [80]:
# initialize the Randomg forest classifier
Random_forest_clf = RandomForestClassifier(n_estimators=50, random_state=100,
                               max_depth=7,
                               max_features=7,
                               min_samples_leaf=50,
                               min_samples_split=50,
                               bootstrap=True, 
                               criterion="entropy")


# Logistic Regression - Forward stepwise selection
sfs_forward = SFS(Random_forest_clf,
                  k_features='best', 
                  forward=True,
                  floating=False,
                  scoring='roc_auc',
                  cv=5)

# Fit the SFS to the training data
sfs_RandomForest_fitted = sfs_forward.fit(X_train_segmented_final, y_train)

# Create a DataFrame to store the results
sfs_RandomForest_pdf = pd.DataFrame(sfs_RandomForest_fitted.subsets_).T
sfs_RandomForest_pdf
Out[80]:
feature_idx cv_scores avg_score feature_names
1 (0,) [0.6623325873471722, 0.6501395268470183, 0.650... 0.658824 (total_pymnt_round_segm,)
2 (0, 7) [0.78496585403239, 0.7685981098560797, 0.78386... 0.780542 (total_pymnt_round_segm, instlmnt_round_segm)
3 (0, 1, 7) [0.7961419479514388, 0.7896147124986794, 0.801... 0.797495 (total_pymnt_round_segm, verification_status_s...
4 (0, 1, 7, 9) [0.80373696807301, 0.7981437052799026, 0.81151... 0.805966 (total_pymnt_round_segm, verification_status_s...
5 (0, 1, 5, 7, 9) [0.8135796341122318, 0.8062696069827986, 0.815... 0.813918 (total_pymnt_round_segm, verification_status_s...
6 (0, 1, 3, 5, 7, 9) [0.8148312291366508, 0.8100804693335838, 0.820... 0.817563 (total_pymnt_round_segm, verification_status_s...
7 (0, 1, 3, 5, 7, 9, 14) [0.8139364934066355, 0.8137097742804486, 0.823... 0.819501 (total_pymnt_round_segm, verification_status_s...
8 (0, 1, 3, 5, 7, 9, 14, 15) [0.8159134999858539, 0.8146649816746199, 0.825... 0.820944 (total_pymnt_round_segm, verification_status_s...
9 (0, 1, 3, 5, 7, 9, 14, 15, 23) [0.8156785841805809, 0.8168742342951593, 0.826... 0.822179 (total_pymnt_round_segm, verification_status_s...
10 (0, 1, 2, 3, 5, 7, 9, 14, 15, 23) [0.8183933757288406, 0.8170633252755679, 0.827... 0.823524 (total_pymnt_round_segm, verification_status_s...
11 (0, 1, 2, 3, 5, 7, 9, 14, 15, 23, 24) [0.8198957517466927, 0.8170878765034586, 0.827... 0.82384 (total_pymnt_round_segm, verification_status_s...
12 (0, 1, 2, 3, 5, 7, 9, 12, 14, 15, 23, 24) [0.8207472790127914, 0.8184729988029307, 0.827... 0.824214 (total_pymnt_round_segm, verification_status_s...
13 (0, 1, 2, 3, 5, 7, 9, 12, 14, 15, 19, 23, 24) [0.8211393425509002, 0.8199693710905929, 0.827... 0.824896 (total_pymnt_round_segm, verification_status_s...
14 (0, 1, 2, 3, 5, 7, 9, 11, 12, 14, 15, 19, 23, 24) [0.8229057324900073, 0.8185682332306523, 0.827... 0.824727 (total_pymnt_round_segm, verification_status_s...
15 (0, 1, 2, 3, 5, 7, 9, 11, 12, 14, 15, 19, 22, ... [0.8221078890095789, 0.8203541203259257, 0.827... 0.825102 (total_pymnt_round_segm, verification_status_s...
16 (0, 1, 2, 3, 5, 7, 8, 9, 11, 12, 14, 15, 19, 2... [0.8243333487329516, 0.8196154540459708, 0.826... 0.824801 (total_pymnt_round_segm, verification_status_s...
17 (0, 1, 2, 3, 5, 7, 8, 9, 11, 12, 13, 14, 15, 1... [0.8244440290128809, 0.821399676954514, 0.8269... 0.825399 (total_pymnt_round_segm, verification_status_s...
18 (0, 1, 2, 3, 5, 7, 8, 9, 11, 12, 13, 14, 15, 1... [0.8242124617292625, 0.8232964024649146, 0.827... 0.825939 (total_pymnt_round_segm, verification_status_s...
19 (0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15... [0.8236040514605106, 0.8217669790709015, 0.826... 0.825206 (total_pymnt_round_segm, verification_status_s...
20 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... [0.8248495940768055, 0.8211275734768861, 0.826... 0.825175 (total_pymnt_round_segm, verification_status_s...
21 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... [0.8256272031750432, 0.8226433612764005, 0.828... 0.825577 (total_pymnt_round_segm, verification_status_s...
22 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... [0.8235568498397545, 0.822390118581715, 0.8245... 0.824789 (total_pymnt_round_segm, verification_status_s...
23 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... [0.8261224978753898, 0.8212807502339882, 0.825... 0.824938 (total_pymnt_round_segm, verification_status_s...
24 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... [0.8250672658908841, 0.82103207063195, 0.82521... 0.824195 (total_pymnt_round_segm, verification_status_s...
25 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... [0.8248818258360471, 0.8219798160275569, 0.824... 0.82413 (total_pymnt_round_segm, verification_status_s...
26 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... [0.8224554727199342, 0.8205941032102126, 0.824... 0.822521 (total_pymnt_round_segm, verification_status_s...
In [81]:
# Again, choose to stop where the performance stops improving
sfs_RandomForest_pdf['feature_idx'][15]
Out[81]:
(0, 1, 2, 3, 5, 7, 9, 11, 12, 14, 15, 19, 22, 23, 24)
In [82]:
X_train_RandomForest = X_train_segmented_final.iloc[:,[0, 1, 2, 3, 5, 7, 9, 11, 12, 14, 15, 19, 22, 23, 24]]
X_test_RandomForest  = X_test_segmented_final.iloc[:, [0, 1, 2, 3, 5, 7, 9, 11, 12, 14, 15, 19, 22, 23, 24]]
X_train_RandomForest
Out[82]:
total_pymnt_round_segm verification_status_segm mths_since_recent_inq_segm inq_last_6mths_segm fico_range_low_segm instlmnt_round_segm term_segm open_acc_6m_segm mort_acc_segm revol_bal_segm emp_length_segm dti_rounded_segm delinq_2yrs_segm application_type_segm home_ownership_segm
0 1 1 1 4 1 1 2 1 1 1 2 2 3 2 2
1 2 2 2 4 2 2 2 2 4 3 2 2 4 2 1
2 1 3 1 2 1 1 1 1 1 1 2 2 3 -1 2
3 2 2 1 3 1 2 1 1 3 1 1 2 3 2 2
4 1 2 2 4 4 1 2 1 4 1 1 1 4 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107859 3 3 -1 4 1 1 1 1 2 2 -1 2 4 2 2
107860 1 3 3 4 2 1 2 1 4 3 1 2 4 2 2
107861 1 2 2 4 2 1 2 1 3 1 1 2 4 2 2
107862 2 1 3 4 2 1 2 2 2 2 2 2 3 2 2
107863 3 2 1 3 1 1 2 1 1 1 2 1 4 2 1

107864 rows × 15 columns

  • Hyperparameter Tuning -right after the feature selection
In [83]:
from skopt import BayesSearchCV
from sklearn.model_selection import KFold, cross_validate
import numpy as np
import matplotlib.pyplot as plt

# Define the parameter space for Bayesian optimization
hyperparameter_space = {
    'n_estimators': (50, 200),
    'max_depth': (6, 9),
    'max_features': (6, 9),
    'criterion': ['gini','entropy'],
    'min_samples_leaf': (70, 150),
    'min_samples_split': (70, 150)
}

# Setup BayesSearchCV
opt = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=100),
    search_spaces=hyperparameter_space,
    scoring='roc_auc',
    n_iter=20,
    cv=5,
    return_train_score=True,
    random_state=100
)

# Fit the BayesSearchCV
opt.fit(X_train_RandomForest, y_train)

# Best estimator found by BayesSearchCV
best_rfc = opt.best_estimator_

# Use KFold for the cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=100)

# Perform cross-validation using 'roc_auc' metric
results = cross_validate(best_rfc, X_train_RandomForest, y_train, cv=kf, scoring='roc_auc')

# Optionally plot the AUC results
plt.errorbar(range(len(results['test_score'])), results['test_score'], fmt='-o')
plt.ylabel('AUC Score')
plt.xlabel('Fold Number')
plt.title('Random Forest AUC Performance across Folds')
plt.show()
  • Obtain an optimal set of hyperparameters
In [84]:
best_rfc
Out[84]:
RandomForestClassifier(max_depth=9, max_features=9, min_samples_leaf=70,
                       min_samples_split=134, n_estimators=198,
                       random_state=100)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_depth=9, max_features=9, min_samples_leaf=70,
                       min_samples_split=134, n_estimators=198,
                       random_state=100)
  • Fit the final Random Forest classifier
In [85]:
# initialize the Randomg forest classifier
Random_forest_clf = RandomForestClassifier(n_estimators=198, n_jobs=-1, random_state=100,
                               max_depth=9,
                               max_features=9,
                               min_samples_leaf=70,
                               min_samples_split=134,
                               bootstrap=True, 
                               criterion="entropy")

Random_forest_clf.fit(X_train_RandomForest, y_train)
Out[85]:
RandomForestClassifier(criterion='entropy', max_depth=9, max_features=9,
                       min_samples_leaf=70, min_samples_split=134,
                       n_estimators=198, n_jobs=-1, random_state=100)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(criterion='entropy', max_depth=9, max_features=9,
                       min_samples_leaf=70, min_samples_split=134,
                       n_estimators=198, n_jobs=-1, random_state=100)
In [86]:
calculate_DiscriminatoryStats(X_train_RandomForest, y_train, Random_forest_clf, 'TRAIN')
     Credit Score  num_applicants  num_goods  num_bads  total  cum_freq_goods  \
0             268              12        0.0      12.0   12.0             0.0   
1             269               2        0.0       2.0    2.0             0.0   
2             270               7        0.0       7.0    7.0             0.0   
3             271               1        0.0       1.0    1.0             0.0   
4             275               7        0.0       7.0    7.0             0.0   
..            ...             ...        ...       ...    ...             ...   
375           685               1        1.0       0.0    1.0        100936.0   
376           686               2        2.0       0.0    2.0        100938.0   
377           695               4        4.0       0.0    4.0        100942.0   
378           696               3        3.0       0.0    3.0        100945.0   
379           699               1        1.0       0.0    1.0        100946.0   

     cum_freq_bads  perc_total_goods  perc_total_bads  cum_perc_goods  \
0             12.0               0.0             0.17            0.00   
1             14.0               0.0             0.03            0.00   
2             21.0               0.0             0.10            0.00   
3             22.0               0.0             0.01            0.00   
4             29.0               0.0             0.10            0.00   
..             ...               ...              ...             ...   
375         6918.0               0.0             0.00           99.99   
376         6918.0               0.0             0.00           99.99   
377         6918.0               0.0             0.00          100.00   
378         6918.0               0.0             0.00          100.00   
379         6918.0               0.0             0.00          100.00   

     cum_perc_bads  Separation  
0             0.17       -0.17  
1             0.20       -0.20  
2             0.30       -0.30  
3             0.32       -0.32  
4             0.42       -0.42  
..             ...         ...  
375         100.00       -0.01  
376         100.00       -0.01  
377         100.00        0.00  
378         100.00        0.00  
379         100.00        0.00  

[380 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TRAIN data is: 48.74
AUC metric on the TRAIN data is: 0.84
Gini metric on the TRAIN data is: 0.68
In [87]:
calculate_DiscriminatoryStats(X_test_RandomForest, y_test, Random_forest_clf, 'TEST')
     Credit Score  num_applicants  num_goods  num_bads  total  cum_freq_goods  \
0             266               1        0.0       1.0    1.0             0.0   
1             267               1        0.0       1.0    1.0             0.0   
2             268               3        1.0       2.0    3.0             1.0   
3             270               2        2.0       0.0    2.0             3.0   
4             275               6        1.0       5.0    6.0             4.0   
..            ...             ...        ...       ...    ...             ...   
368           683               2        2.0       0.0    2.0         40544.0   
369           684               3        3.0       0.0    3.0         40547.0   
370           686               2        2.0       0.0    2.0         40549.0   
371           687               1        1.0       0.0    1.0         40550.0   
372           695               1        1.0       0.0    1.0         40551.0   

     cum_freq_bads  perc_total_goods  perc_total_bads  cum_perc_goods  \
0              1.0              0.00             0.04            0.00   
1              2.0              0.00             0.04            0.00   
2              4.0              0.00             0.08            0.00   
3              4.0              0.00             0.00            0.01   
4              9.0              0.00             0.21            0.01   
..             ...               ...              ...             ...   
368         2377.0              0.00             0.00           99.98   
369         2377.0              0.01             0.00           99.99   
370         2377.0              0.00             0.00          100.00   
371         2377.0              0.00             0.00          100.00   
372         2377.0              0.00             0.00          100.00   

     cum_perc_bads  Separation  
0             0.04       -0.04  
1             0.08       -0.08  
2             0.17       -0.17  
3             0.17       -0.16  
4             0.38       -0.37  
..             ...         ...  
368         100.00       -0.02  
369         100.00       -0.01  
370         100.00        0.00  
371         100.00        0.00  
372         100.00        0.00  

[373 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TEST data is: 42.39
AUC metric on the TEST data is: 0.81
Gini metric on the TEST data is: 0.62

PSI & IVs for the Random Forest¶

In [88]:
calculate_and_plot_psi(X_train_RandomForest, X_test_RandomForest, Random_forest_clf, 'TRAINING', 'TEST')
The PSI statistic between TRAINING and TEST sets is: 0.150
Moderate shift in the population (PSI = 0.150)
In [89]:
# Calculate and compare IVs
iv_comparison_df = calculate_iv_comparison(X_train_RandomForest, X_test_RandomForest, y_train, y_test)
iv_comparison_df
Out[89]:
Variable IV_Train IV_Test
0 total_pymnt_round_segm 55.61 55.74
2 mths_since_recent_inq_segm 7.76 10.87
1 verification_status_segm 7.54 7.90
3 inq_last_6mths_segm 7.34 9.99
4 fico_range_low_segm 6.50 6.33
6 term_segm 4.77 2.90
5 instlmnt_round_segm 3.04 2.70
8 mort_acc_segm 2.97 3.75
10 emp_length_segm 2.74 2.83
7 open_acc_6m_segm 2.63 3.67
9 revol_bal_segm 1.64 2.66
12 delinq_2yrs_segm 0.50 0.10
11 dti_rounded_segm 0.37 0.76
13 application_type_segm 0.37 0.49
14 home_ownership_segm 0.06 0.13

¶

Gradient Boosting - Credit Score model¶

  • Feature Selection

Gradient Boosting does not need segmented variables -it can handle categorical data exeptionally well

In [91]:
# initialize the Randomg forest classifier
GradientBoosting_clf = GradientBoostingClassifier(n_estimators=50, random_state=100,
                               max_depth = 7,
                               min_samples_split = 50,
                               min_samples_leaf = 50
                               )


# GradientBoosting - Forward selection
sfs_forward = SFS(GradientBoosting_clf,
                  k_features='best', 
                  forward=True,
                  floating=False,
                  scoring='roc_auc',
                  cv=5)

# Fit the SFS to the training data
sfs_GB_fitted = sfs_forward.fit(X_train_segmented_final, y_train)

# Create a DataFrame to store the results
sfs_GB_pdf = pd.DataFrame(sfs_GB_fitted.subsets_).T
sfs_GB_pdf
Out[91]:
feature_idx cv_scores avg_score feature_names
1 (0,) [0.6623325873471722, 0.6501395268470183, 0.650... 0.658824 (total_pymnt_round_segm,)
2 (0, 7) [0.7849806448285755, 0.7685981098560798, 0.783... 0.780537 (total_pymnt_round_segm, instlmnt_round_segm)
3 (0, 1, 7) [0.7960789706751874, 0.7896001642856509, 0.801... 0.797504 (total_pymnt_round_segm, verification_status_s...
4 (0, 1, 7, 9) [0.8036432810928142, 0.7984550513542973, 0.811... 0.806297 (total_pymnt_round_segm, verification_status_s...
5 (0, 1, 5, 7, 9) [0.81259010119698, 0.805397125774151, 0.815951... 0.813734 (total_pymnt_round_segm, verification_status_s...
6 (0, 1, 3, 5, 7, 9) [0.8157633895204522, 0.8087837815590645, 0.819... 0.817315 (total_pymnt_round_segm, verification_status_s...
7 (0, 1, 3, 5, 7, 9, 14) [0.818077379142542, 0.8158363722933882, 0.8232... 0.820773 (total_pymnt_round_segm, verification_status_s...
8 (0, 1, 3, 5, 7, 9, 14, 23) [0.8208706192114893, 0.8193732164570817, 0.825... 0.823546 (total_pymnt_round_segm, verification_status_s...
9 (0, 1, 3, 5, 7, 9, 14, 15, 23) [0.8252111269762993, 0.82227545074766, 0.82740... 0.82551 (total_pymnt_round_segm, verification_status_s...
10 (0, 1, 3, 5, 6, 7, 9, 14, 15, 23) [0.8253856619525928, 0.8221907203802679, 0.831... 0.826683 (total_pymnt_round_segm, verification_status_s...
11 (0, 1, 3, 5, 6, 7, 9, 12, 14, 15, 23) [0.8264928586956094, 0.824087750096845, 0.8318... 0.82782 (total_pymnt_round_segm, verification_status_s...
12 (0, 1, 3, 5, 6, 7, 9, 12, 13, 14, 15, 23) [0.8274306238242124, 0.8259570970513617, 0.833... 0.82924 (total_pymnt_round_segm, verification_status_s...
13 (0, 1, 2, 3, 5, 6, 7, 9, 12, 13, 14, 15, 23) [0.8288338334627975, 0.8229335918632508, 0.835... 0.829392 (total_pymnt_round_segm, verification_status_s...
14 (0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 23) [0.827908638720299, 0.8246504778399043, 0.8366... 0.830019 (total_pymnt_round_segm, verification_status_s...
15 (0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 2... [0.829238198789017, 0.8251433097402951, 0.8371... 0.830789 (total_pymnt_round_segm, verification_status_s...
16 (0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 2... [0.8305616169169462, 0.8241356894113632, 0.836... 0.830637 (total_pymnt_round_segm, verification_status_s...
17 (0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 1... [0.8307545777156063, 0.8248524349521302, 0.837... 0.830818 (total_pymnt_round_segm, verification_status_s...
18 (0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 1... [0.8308250578291482, 0.824376907014837, 0.8370... 0.830737 (total_pymnt_round_segm, verification_status_s...
19 (0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15... [0.8297277454922989, 0.825593534881497, 0.8364... 0.83092 (total_pymnt_round_segm, verification_status_s...
20 (0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15... [0.8301681208562044, 0.824090720580686, 0.8356... 0.830367 (total_pymnt_round_segm, verification_status_s...
21 (0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... [0.8303448583360461, 0.8251569453347942, 0.836... 0.830744 (total_pymnt_round_segm, verification_status_s...
22 (0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... [0.83066303951936, 0.8255756225060463, 0.83691... 0.830938 (total_pymnt_round_segm, verification_status_s...
23 (0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... [0.8299909715261057, 0.8250851526771432, 0.835... 0.830453 (total_pymnt_round_segm, verification_status_s...
24 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... [0.83006687731912, 0.8249857846243176, 0.83611... 0.83042 (total_pymnt_round_segm, verification_status_s...
25 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... [0.8316857353335647, 0.8256496340793376, 0.835... 0.830279 (total_pymnt_round_segm, verification_status_s...
26 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... [0.8308314862744635, 0.8267472815420289, 0.834... 0.830266 (total_pymnt_round_segm, verification_status_s...
In [92]:
# Again, choose to stop where the performance stops improving
sfs_GB_pdf['feature_idx'][15]
Out[92]:
(0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 23, 24)
In [93]:
X_train_GB = X_train_segmented_final.iloc[:,[0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 23, 24]]
X_test_GB  = X_test_segmented_final.iloc[:, [0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 23, 24]]
X_train_GB
Out[93]:
total_pymnt_round_segm verification_status_segm mths_since_recent_inq_segm inq_last_6mths_segm fico_range_low_segm purpose_segm instlmnt_round_segm term_segm open_acc_6m_segm mort_acc_segm open_rv_12m_segm revol_bal_segm emp_length_segm application_type_segm home_ownership_segm
0 1 1 1 4 1 2 1 2 1 1 1 1 2 2 2
1 2 2 2 4 2 2 2 2 2 4 2 3 2 2 1
2 1 3 1 2 1 1 1 1 1 1 1 1 2 -1 2
3 2 2 1 3 1 2 2 1 1 3 1 1 1 2 2
4 1 2 2 4 4 1 1 2 1 4 1 1 1 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107859 3 3 -1 4 1 2 1 1 1 2 1 2 -1 2 2
107860 1 3 3 4 2 2 1 2 1 4 1 3 1 2 2
107861 1 2 2 4 2 2 1 2 1 3 1 1 1 2 2
107862 2 1 3 4 2 2 1 2 2 2 2 2 2 2 2
107863 3 2 1 3 1 1 1 2 1 1 1 1 2 2 1

107864 rows × 15 columns

  • Hyperparameter Tuning -right after the feature selection
In [94]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Define the parameter space for Bayesian optimization
hyperparameter_space = {
    'n_estimators': Integer(50, 200),
    'learning_rate': Real(0.001, 0.2, prior='uniform'),
    'min_impurity_decrease': Real(0.001, 0.2, prior='uniform'),
    'max_depth': Integer(6, 9),
    'max_features': Integer(6, 9),
    'min_samples_leaf': Integer(70, 150),
    'min_samples_split': Integer(70, 150),
    'subsample': Real(0.5, 0.8, prior='uniform')
}

# Setup BayesSearchCV
opt = BayesSearchCV(
    estimator=GradientBoostingClassifier(random_state=100),
    search_spaces=hyperparameter_space,
    scoring='roc_auc',
    n_iter=20,
    cv=5,
    return_train_score=True,
    random_state=100
)

# Fit the BayesSearchCV
opt.fit(X_train_GB, y_train)

# Best estimator found by BayesSearchCV
best_GBc = opt.best_estimator_

# Use KFold for the cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=100)

# Perform cross-validation using 'roc_auc' metric
results = cross_validate(best_GBc, X_train_GB, y_train, cv=kf, scoring='roc_auc')

# Optionally plot the AUC results
plt.errorbar(range(len(results['test_score'])), results['test_score'], fmt='-o')
plt.ylabel('AUC Score')
plt.xlabel('Fold Number')
plt.title('GradientBoosting AUC Performance across Folds')
plt.show()
  • Optimal hyperparameters
In [95]:
best_GBc
Out[95]:
GradientBoostingClassifier(learning_rate=0.11594796160025214, max_depth=6,
                           max_features=8,
                           min_impurity_decrease=0.14285292256728815,
                           min_samples_leaf=104, min_samples_split=72,
                           n_estimators=71, random_state=100,
                           subsample=0.6381717003491763)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingClassifier(learning_rate=0.11594796160025214, max_depth=6,
                           max_features=8,
                           min_impurity_decrease=0.14285292256728815,
                           min_samples_leaf=104, min_samples_split=72,
                           n_estimators=71, random_state=100,
                           subsample=0.6381717003491763)
In [96]:
# initialize the GradientBoosting
GradientBoosting_clf = GradientBoostingClassifier(n_estimators=71, random_state=100,
                               max_depth = 6,
                               max_features = 8,
                               min_impurity_decrease = 0.14285,
                               learning_rate = 0.11595, 
                               min_samples_split = 72,
                               min_samples_leaf = 104,
                               subsample = 0.638
                               )

GradientBoosting_clf.fit(X_train_GB, y_train)
Out[96]:
GradientBoostingClassifier(learning_rate=0.11595, max_depth=6, max_features=8,
                           min_impurity_decrease=0.14285, min_samples_leaf=104,
                           min_samples_split=72, n_estimators=71,
                           random_state=100, subsample=0.638)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingClassifier(learning_rate=0.11595, max_depth=6, max_features=8,
                           min_impurity_decrease=0.14285, min_samples_leaf=104,
                           min_samples_split=72, n_estimators=71,
                           random_state=100, subsample=0.638)
In [97]:
calculate_DiscriminatoryStats(X_train_GB, y_train, GradientBoosting_clf, 'TRAIN')
     Credit Score  num_applicants  num_goods  num_bads  total  cum_freq_goods  \
0             252               1        0.0       1.0    1.0             0.0   
1             255               1        0.0       1.0    1.0             0.0   
2             256               1        0.0       1.0    1.0             0.0   
3             257               1        0.0       1.0    1.0             0.0   
4             258               1        0.0       1.0    1.0             0.0   
..            ...             ...        ...       ...    ...             ...   
297           565              24       24.0       0.0   24.0        100901.0   
298           566              20       20.0       0.0   20.0        100921.0   
299           567              19       19.0       0.0   19.0        100940.0   
300           568               5        5.0       0.0    5.0        100945.0   
301           570               1        1.0       0.0    1.0        100946.0   

     cum_freq_bads  perc_total_goods  perc_total_bads  cum_perc_goods  \
0              1.0              0.00             0.01            0.00   
1              2.0              0.00             0.01            0.00   
2              3.0              0.00             0.01            0.00   
3              4.0              0.00             0.01            0.00   
4              5.0              0.00             0.01            0.00   
..             ...               ...              ...             ...   
297         6918.0              0.02             0.00           99.96   
298         6918.0              0.02             0.00           99.98   
299         6918.0              0.02             0.00           99.99   
300         6918.0              0.00             0.00          100.00   
301         6918.0              0.00             0.00          100.00   

     cum_perc_bads  Separation  
0             0.01       -0.01  
1             0.03       -0.03  
2             0.04       -0.04  
3             0.06       -0.06  
4             0.07       -0.07  
..             ...         ...  
297         100.00       -0.04  
298         100.00       -0.02  
299         100.00       -0.01  
300         100.00        0.00  
301         100.00        0.00  

[302 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TRAIN data is: 49.23
AUC metric on the TRAIN data is: 0.84
Gini metric on the TRAIN data is: 0.68
In [99]:
calculate_DiscriminatoryStats(X_test_GB, y_test, GradientBoosting_clf, 'TEST')
     Credit Score  num_applicants  num_goods  num_bads  total  cum_freq_goods  \
0             249               1        0.0       1.0    1.0             0.0   
1             252               1        0.0       1.0    1.0             0.0   
2             255               1        0.0       1.0    1.0             0.0   
3             257               2        0.0       2.0    2.0             0.0   
4             258               1        0.0       1.0    1.0             0.0   
..            ...             ...        ...       ...    ...             ...   
301           566               7        7.0       0.0    7.0         40543.0   
302           567               5        5.0       0.0    5.0         40548.0   
303           568               1        1.0       0.0    1.0         40549.0   
304           569               1        1.0       0.0    1.0         40550.0   
305           570               1        1.0       0.0    1.0         40551.0   

     cum_freq_bads  perc_total_goods  perc_total_bads  cum_perc_goods  \
0              1.0              0.00             0.04            0.00   
1              2.0              0.00             0.04            0.00   
2              3.0              0.00             0.04            0.00   
3              5.0              0.00             0.08            0.00   
4              6.0              0.00             0.04            0.00   
..             ...               ...              ...             ...   
301         2377.0              0.02             0.00           99.98   
302         2377.0              0.01             0.00           99.99   
303         2377.0              0.00             0.00          100.00   
304         2377.0              0.00             0.00          100.00   
305         2377.0              0.00             0.00          100.00   

     cum_perc_bads  Separation  
0             0.04       -0.04  
1             0.08       -0.08  
2             0.13       -0.13  
3             0.21       -0.21  
4             0.25       -0.25  
..             ...         ...  
301         100.00       -0.02  
302         100.00       -0.01  
303         100.00        0.00  
304         100.00        0.00  
305         100.00        0.00  

[306 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TEST data is: 44.15
AUC metric on the TEST data is: 0.82
Gini metric on the TEST data is: 0.63

PSI and IVs for the GB classifier¶

In [100]:
calculate_and_plot_psi(X_train_GB, X_test_GB, GradientBoosting_clf, 'TRAINING', 'TEST')
The PSI statistic between TRAINING and TEST sets is: 0.152
Moderate shift in the population (PSI = 0.152)
In [101]:
# Calculate and compare IVs
iv_comparison_df = calculate_iv_comparison(X_train_GB, X_test_GB, y_train, y_test)
iv_comparison_df
Out[101]:
Variable IV_Train IV_Test
0 total_pymnt_round_segm 55.61 55.74
2 mths_since_recent_inq_segm 7.76 10.87
1 verification_status_segm 7.54 7.90
3 inq_last_6mths_segm 7.34 9.99
4 fico_range_low_segm 6.50 6.33
7 term_segm 4.77 2.90
5 purpose_segm 3.34 3.62
6 instlmnt_round_segm 3.04 2.70
9 mort_acc_segm 2.97 3.75
12 emp_length_segm 2.74 2.83
8 open_acc_6m_segm 2.63 3.67
10 open_rv_12m_segm 2.04 3.30
11 revol_bal_segm 1.64 2.66
13 application_type_segm 0.37 0.49
14 home_ownership_segm 0.06 0.13

Neural Networks¶

In [102]:
import tensorflow as tf

# for the analysis of the optizimation procedure
from skopt.space import Real, Integer, Categorical

#####################  Tensorflow - Keras   #########################
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective, plot_evaluations
from skopt.utils import use_named_args
  • Also create a Validation Set

The validation set will be a small part of the training set.

We will use 20% of the Training set as validation set

In [103]:
categorical_columns = X_train_segmented_final.select_dtypes(include=['category', 'object']).columns
categorical_columns
Out[103]:
Index(['total_pymnt_round_segm', 'verification_status_segm',
       'mths_since_recent_inq_segm', 'inq_last_6mths_segm',
       'inq_last_12m_segm', 'fico_range_low_segm', 'purpose_segm',
       'instlmnt_round_segm', 'acc_open_past_24mths_segm', 'term_segm',
       'open_rv_24m_segm', 'open_acc_6m_segm', 'mort_acc_segm',
       'open_rv_12m_segm', 'revol_bal_segm', 'emp_length_segm',
       'open_il_12m_segm', 'mths_since_last_major_derog_segm',
       'Annual_Inc_round_segm', 'dti_rounded_segm',
       'mths_since_last_record_segm', 'mths_since_last_delinq_segm',
       'delinq_2yrs_segm', 'application_type_segm', 'home_ownership_segm',
       'years_with_Credit_line_segm'],
      dtype='object')
In [104]:
X_train_segmented_final_int = X_train_segmented_final.copy()
X_test_segmented_final_int = X_test_segmented_final.copy()
In [105]:
for col in categorical_columns:
    X_train_segmented_final_int[col] = X_train_segmented_final[col].astype(int)
    X_test_segmented_final_int[col]  = X_test_segmented_final[col].astype(int)
In [106]:
y_train_int = y_train.astype(int)
y_train_int.shape
Out[106]:
(107864,)
In [107]:
X_train_nn, X_val, y_train_nn, y_val =  train_test_split(
    X_train_segmented_final_int, 
    y_train_int,  
    test_size=0.2,
    random_state=100,
    stratify = y_train_int)
X_train_nn.shape, X_val.shape

# Convert one-hot encoded targets to single integer labels if necessary
if len(y_train_nn.shape) > 1 and y_train_nn.shape[1] == 2:
    y_train_nn = y_train_nn.argmax(axis=1)
    y_val = y_val.argmax(axis=1)
In [108]:
X_train_nn.shape
Out[108]:
(86291, 26)
In [109]:
X_val.shape
Out[109]:
(21573, 26)
In [212]:
# Convert y_train into categorical cross entropy format
# num_classes = len(np.unique(y_train))
# y_train_nn = keras.utils.to_categorical(y_train_nn, num_classes)
# y_test_nn = keras.utils.to_categorical(y_test, num_classes)
# y_val_nn = keras.utils.to_categorical(y_val, num_classes)

# y_train_nn
  • Input Layer: The input shape is set to (26,) to match the 26 predictors.

  • Hidden Layers: Two dense layers with the specified number of nodes and activation functions.

  • Output Layer: A single node with a sigmoid activation function -for binary classification.

  • Optimizer: Adam optimizer with the specified learning rate.

  • Loss Function: binary_crossentropy -for binary classification problems.

  • Metrics: accuracy to monitor the accuracy during training and evaluation ("roc_auc" will be used)

  • Defining the hyperparameter space -will be more conservative in order not to overfit
In [110]:
tf.random.set_seed(100)

# Creating the model
model_opt = tf.keras.Sequential([
    
  tf.keras.layers.Dense(16, activation=tf.keras.activations.relu, input_shape=(26,)), # Input layer, relu activation
  tf.keras.layers.Dropout(0.2),  # Dropout layer with a 20% dropout rate
    
  tf.keras.layers.Dense(32, activation=tf.keras.activations.relu), # hidden layer 1, relu activation
  tf.keras.layers.Dropout(0.2),  # Dropout layer with a 20% dropout rate
    
  tf.keras.layers.Dense(64, activation=tf.keras.activations.relu), # hidden layer 2, relu activation
  tf.keras.layers.Dropout(0.2),  # Dropout layer with a 20% dropout rate
    
  tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid) # ouput layer, sigmoid activation
])

# Compile the model
model_opt.compile(loss=tf.keras.losses.BinaryCrossentropy(), # for binary problesms --> BinaryCrossentropy()
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['AUC'])

# Fit the model & obbtain the history curves
history = model_opt.fit(X_train_nn, y_train_nn,
                      validation_data=(X_val, y_val),
                      epochs=50, 
                      batch_size=16,
                      verbose=1)
Epoch 1/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 9s 1ms/step - AUC: 0.6185 - loss: 0.2454 - val_AUC: 0.7979 - val_loss: 0.1990
Epoch 2/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.7768 - loss: 0.2017 - val_AUC: 0.8146 - val_loss: 0.1886
Epoch 3/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.7950 - loss: 0.1878 - val_AUC: 0.8140 - val_loss: 0.1836
Epoch 4/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8041 - loss: 0.1822 - val_AUC: 0.8144 - val_loss: 0.1805
Epoch 5/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8034 - loss: 0.1812 - val_AUC: 0.8134 - val_loss: 0.1844
Epoch 6/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8033 - loss: 0.1805 - val_AUC: 0.8168 - val_loss: 0.1824
Epoch 7/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8052 - loss: 0.1795 - val_AUC: 0.8144 - val_loss: 0.1845
Epoch 8/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8075 - loss: 0.1787 - val_AUC: 0.8150 - val_loss: 0.1835
Epoch 9/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8099 - loss: 0.1785 - val_AUC: 0.8164 - val_loss: 0.1829
Epoch 10/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8122 - loss: 0.1783 - val_AUC: 0.8199 - val_loss: 0.1827
Epoch 11/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8099 - loss: 0.1790 - val_AUC: 0.8142 - val_loss: 0.1902
Epoch 12/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8119 - loss: 0.1773 - val_AUC: 0.8132 - val_loss: 0.1834
Epoch 13/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8094 - loss: 0.1783 - val_AUC: 0.8154 - val_loss: 0.1826
Epoch 14/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8105 - loss: 0.1776 - val_AUC: 0.8172 - val_loss: 0.1825
Epoch 15/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8162 - loss: 0.1765 - val_AUC: 0.8165 - val_loss: 0.1832
Epoch 16/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8134 - loss: 0.1776 - val_AUC: 0.8127 - val_loss: 0.1820
Epoch 17/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8103 - loss: 0.1774 - val_AUC: 0.8154 - val_loss: 0.1827
Epoch 18/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8133 - loss: 0.1775 - val_AUC: 0.8181 - val_loss: 0.1847
Epoch 19/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8123 - loss: 0.1775 - val_AUC: 0.8161 - val_loss: 0.1820
Epoch 20/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8141 - loss: 0.1772 - val_AUC: 0.8162 - val_loss: 0.1838
Epoch 21/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8163 - loss: 0.1764 - val_AUC: 0.8194 - val_loss: 0.1842
Epoch 22/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8149 - loss: 0.1765 - val_AUC: 0.8154 - val_loss: 0.1869
Epoch 23/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8145 - loss: 0.1774 - val_AUC: 0.8167 - val_loss: 0.1843
Epoch 24/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8154 - loss: 0.1759 - val_AUC: 0.8149 - val_loss: 0.1888
Epoch 25/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8164 - loss: 0.1766 - val_AUC: 0.8157 - val_loss: 0.1857
Epoch 26/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8151 - loss: 0.1768 - val_AUC: 0.8186 - val_loss: 0.1833
Epoch 27/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8164 - loss: 0.1759 - val_AUC: 0.8150 - val_loss: 0.1820
Epoch 28/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8165 - loss: 0.1762 - val_AUC: 0.8169 - val_loss: 0.1848
Epoch 29/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8180 - loss: 0.1752 - val_AUC: 0.8168 - val_loss: 0.1811
Epoch 30/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8166 - loss: 0.1771 - val_AUC: 0.8166 - val_loss: 0.1832
Epoch 31/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8155 - loss: 0.1761 - val_AUC: 0.8130 - val_loss: 0.1819
Epoch 32/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8163 - loss: 0.1761 - val_AUC: 0.8148 - val_loss: 0.1839
Epoch 33/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8181 - loss: 0.1760 - val_AUC: 0.8171 - val_loss: 0.1835
Epoch 34/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8193 - loss: 0.1752 - val_AUC: 0.8159 - val_loss: 0.1822
Epoch 35/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8188 - loss: 0.1755 - val_AUC: 0.8168 - val_loss: 0.1846
Epoch 36/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8176 - loss: 0.1764 - val_AUC: 0.8164 - val_loss: 0.1865
Epoch 37/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8159 - loss: 0.1761 - val_AUC: 0.8149 - val_loss: 0.1850
Epoch 38/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8213 - loss: 0.1758 - val_AUC: 0.8155 - val_loss: 0.1866
Epoch 39/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8181 - loss: 0.1757 - val_AUC: 0.8158 - val_loss: 0.1856
Epoch 40/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8181 - loss: 0.1770 - val_AUC: 0.8162 - val_loss: 0.1813
Epoch 41/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8181 - loss: 0.1761 - val_AUC: 0.8154 - val_loss: 0.1811
Epoch 42/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8175 - loss: 0.1755 - val_AUC: 0.8164 - val_loss: 0.1850
Epoch 43/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8192 - loss: 0.1756 - val_AUC: 0.8168 - val_loss: 0.1808
Epoch 44/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8167 - loss: 0.1763 - val_AUC: 0.8189 - val_loss: 0.1812
Epoch 45/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8182 - loss: 0.1759 - val_AUC: 0.8156 - val_loss: 0.1846
Epoch 46/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8165 - loss: 0.1757 - val_AUC: 0.8161 - val_loss: 0.1825
Epoch 47/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8165 - loss: 0.1764 - val_AUC: 0.8173 - val_loss: 0.1813
Epoch 48/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8175 - loss: 0.1762 - val_AUC: 0.8144 - val_loss: 0.1835
Epoch 49/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8201 - loss: 0.1753 - val_AUC: 0.8159 - val_loss: 0.1835
Epoch 50/50
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8185 - loss: 0.1751 - val_AUC: 0.8160 - val_loss: 0.1807
In [111]:
# Plot the loss curves
pd.DataFrame(history.history).plot(title="Diagnostic plot - Optimized model")
Out[111]:
<Axes: title={'center': 'Diagnostic plot - Optimized model'}>
In [114]:
calculate_DiscriminatoryStats_nn(X_train_segmented_final_int, y_train, model_opt, 'TRAINING')
3371/3371 ━━━━━━━━━━━━━━━━━━━━ 2s 667us/step
     Credit Score  num_applicants  num_goods  num_bads  total  cum_freq_goods  \
0               3             248        7.0     241.0  248.0             7.0   
1               5               2        0.0       2.0    2.0             7.0   
2               6               2        0.0       2.0    2.0             7.0   
3               8               1        0.0       1.0    1.0             7.0   
4               9               3        0.0       3.0    3.0             7.0   
..            ...             ...        ...       ...    ...             ...   
623           834               1        1.0       0.0    1.0        100942.0   
624           840               1        1.0       0.0    1.0        100943.0   
625           848               1        1.0       0.0    1.0        100944.0   
626           860               1        1.0       0.0    1.0        100945.0   
627           880               1        1.0       0.0    1.0        100946.0   

     cum_freq_bads  perc_total_goods  perc_total_bads  cum_perc_goods  \
0            241.0              0.01             3.48            0.01   
1            243.0              0.00             0.03            0.01   
2            245.0              0.00             0.03            0.01   
3            246.0              0.00             0.01            0.01   
4            249.0              0.00             0.04            0.01   
..             ...               ...              ...             ...   
623         6918.0              0.00             0.00          100.00   
624         6918.0              0.00             0.00          100.00   
625         6918.0              0.00             0.00          100.00   
626         6918.0              0.00             0.00          100.00   
627         6918.0              0.00             0.00          100.00   

     cum_perc_bads  Separation  
0             3.48       -3.47  
1             3.51       -3.50  
2             3.54       -3.53  
3             3.56       -3.55  
4             3.60       -3.59  
..             ...         ...  
623         100.00        0.00  
624         100.00        0.00  
625         100.00        0.00  
626         100.00        0.00  
627         100.00        0.00  

[628 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TRAINING data is: 46.46
AUC metric on the TRAINING data is: 0.83
Gini metric on the TRAINING data is: 0.65
In [115]:
calculate_DiscriminatoryStats_nn(X_test_segmented_final_int, y_test, model_opt, 'TEST')
1342/1342 ━━━━━━━━━━━━━━━━━━━━ 1s 568us/step
     Credit Score  num_applicants  num_goods  num_bads  total  cum_freq_goods  \
0               2             137        1.0     136.0  137.0             1.0   
1               3               3        0.0       3.0    3.0             1.0   
2               5               1        0.0       1.0    1.0             1.0   
3               6               3        0.0       3.0    3.0             1.0   
4              12               1        0.0       1.0    1.0             1.0   
..            ...             ...        ...       ...    ...             ...   
554           797               1        1.0       0.0    1.0         40546.0   
555           814               1        1.0       0.0    1.0         40547.0   
556           834               1        1.0       0.0    1.0         40548.0   
557           848               1        1.0       0.0    1.0         40549.0   
558           880               2        2.0       0.0    2.0         40551.0   

     cum_freq_bads  perc_total_goods  perc_total_bads  cum_perc_goods  \
0            136.0               0.0             5.72            0.00   
1            139.0               0.0             0.13            0.00   
2            140.0               0.0             0.04            0.00   
3            143.0               0.0             0.13            0.00   
4            144.0               0.0             0.04            0.00   
..             ...               ...              ...             ...   
554         2377.0               0.0             0.00           99.99   
555         2377.0               0.0             0.00           99.99   
556         2377.0               0.0             0.00           99.99   
557         2377.0               0.0             0.00          100.00   
558         2377.0               0.0             0.00          100.00   

     cum_perc_bads  Separation  
0             5.72       -5.72  
1             5.85       -5.85  
2             5.89       -5.89  
3             6.02       -6.02  
4             6.06       -6.06  
..             ...         ...  
554         100.00       -0.01  
555         100.00       -0.01  
556         100.00       -0.01  
557         100.00        0.00  
558         100.00        0.00  

[559 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TEST data is: 44.02
AUC metric on the TEST data is: 0.81
Gini metric on the TEST data is: 0.63
In [116]:
calculate_and_plot_psi_nn(X_train_nn, X_test_segmented_final_int, model_opt, 'Training', 'Test')
2697/2697 ━━━━━━━━━━━━━━━━━━━━ 2s 577us/step
1342/1342 ━━━━━━━━━━━━━━━━━━━━ 1s 574us/step
The PSI statistic between Training and Test sets is: 0.145
Moderate shift in the population (PSI = 0.145)